diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index a6a51b65f0..65d93d537a 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -1,4 +1,4 @@
#!/bin/bash
. ./mfc.sh load -c f -m g
-./mfc.sh build -j 8 --gpu
+./mfc.sh test --dry-run -j 8 --gpu
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 5cdc57e78c..e89af47214 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -5,7 +5,7 @@ if [ "$job_device" == "gpu" ]; then
build_opts="--gpu"
fi
-./mfc.sh build -j 8 $build_opts
+./mfc.sh test --dry-run -j 8 $build_opts
n_test_threads=8
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1be80f5c79..b312197e03 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,7 +84,7 @@ jobs:
- name: Build
run: |
if [ '${{ matrix.intel }}' == 'true' ]; then . /opt/intel/oneapi/setvars.sh; fi
- /bin/bash mfc.sh build -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }}
+ /bin/bash mfc.sh test --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }}
- name: Test
run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de84d7915d..8a8e19120e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -335,7 +335,7 @@ macro(HANDLE_SOURCES target useCommon)
add_custom_command(
OUTPUT ${f90}
- COMMAND ${FYPP_EXE} -m re
+ COMMAND ${FYPP_EXE} -m re -m itertools
-I "${CMAKE_BINARY_DIR}/include/${target}"
-I "${${target}_DIR}/include"
-I "${common_DIR}/include"
diff --git a/docs/documentation/case.md b/docs/documentation/case.md
index d9461453e4..86f9afb460 100644
--- a/docs/documentation/case.md
+++ b/docs/documentation/case.md
@@ -337,8 +337,8 @@ Details of implementation of viscosity in MFC can be found in [Coralic (2015)](r
| Parameter | Type | Description |
| ---: | :----: | :--- |
| `bc_[x,y,z]%%beg[end]` | Integer | Beginning [ending] boundary condition in the $[x,y,z]$-direction (negative integer, see table [Boundary Conditions](#boundary-conditions)) |
-| `bc_[x,y,z]%%vb[1,2,3]`‡| Real | Velocity in the (x,1), (y, 2), (z,3) direction applied to `bc_[x,y,z]%%beg` |
-| `bc_[x,y,z]%%ve[1,2,3]`‡| Real | Velocity in the (x,1), (y, 2), (z,3) direction applied to `bc_[x,y,z]%%end` |
+| `bc_[x,y,z]%%vel_beg[1,2,3]`‡| Real | Velocity in the (x,1), (y, 2), (z,3) direction applied to `bc_[x,y,z]%%beg` |
+| `bc_[x,y,z]%%vel_end[1,2,3]`‡| Real | Velocity in the (x,1), (y, 2), (z,3) direction applied to `bc_[x,y,z]%%end` |
| `model_eqns` | Integer | Multicomponent model: [1] $\Gamma/\Pi_\infty$; [2] 5-equation; [3] 6-equation; [4] 4-equation |
| `alt_soundspeed` * | Logical | Alternate sound speed and $K \nabla \cdot u$ for 5-equation model |
| `adv_n` | Logical | Solving directly for the number density (in the method of classes) and compute void fraction from the number density |
@@ -463,7 +463,7 @@ The value of `dt` needs to be sufficiently small to satisfy the Courant-Friedric
To newly start the simulation, set `t_step_start = 0`.
To restart the simulation from $k$-th time step, set `t_step_start = k`; see [Restarting Cases](running.md#restarting-cases).
-##### Adaptive Time-Stepping
+##### Adaptive Time-Stepping (optional)
- `cfl_adap_dt` enables adaptive time stepping with a constant CFL when true
@@ -480,6 +480,36 @@ To restart the simulation from $k$-th time step, set `t_step_start = k`; see [Re
To newly start the simulation, set `n_start = 0`.
To restart the simulation from $k$-th time step, see [Restarting Cases](running.md#restarting-cases).
+##### Boundary Condition Patches (optional and experimental)
+
+> [!WARNING]
+> This feature is currently experimental and may not produce physically correct
+> results when combined with certain other features.
+
+Boundary condition patches allow you to define boundary conditions with more granularity
+than `bc_[x,y,z]`. When using this feature, any point along the edge of the domain can
+be assigned its own boundary condition type. Since the boundaries of a 3D domain are
+2D surfaces, the concept of patches is re-introduced.
+
+Boundary conditions are applied in layers using the [Painter's algorithm](https://en.wikipedia.org/wiki/Painter%27s_algorithm):
+patches with higher indices take priority, and the lowest priority is given to
+`bc_[x,y,z]`. This feature is opt-in and is enabled by setting `num_bc_patches` to a
+positive value; a short example follows the parameter table below.
+
+| Parameter | Type | Description |
+| ---: | :----: | :--- |
+| `num_bc_patches` | Integer | Number of boundary condition patches (default 0) |
+| `%%type` * | Integer | Boundary condition type (negative integer, see table [Boundary Conditions](#boundary-conditions)) |
+| `%%dir` * | Integer | Coordinate direction of the patched boundary: [1] x; [2] y; [3] z |
+| `%%loc` * | Integer | End of the `%%dir` axis at which the patch is applied: [-1] beginning; [+1] end |
+| `%%geometry` * | Integer | The geometry of the boundary condition patch. See table [Boundary Condition Patch Geometry Types](#boundary-condition-patch-geometry-types) |
+| `%%vel(i)` * | Real | Meaning depends on the boundary condition |
+| `%%centroid(i)` * | Real | Meaning depends on the patch geometry. Index $i = \text{dir}$ is always ignored |
+| `%%length(i)` * | Real | Meaning depends on the patch geometry. Index $i = \text{dir}$ is always ignored |
+| `%%radius` * | Real | Meaning depends on the patch geometry |
+
+*: These parameters should be prepended with `patch_bc(j)%%`, where $j$ is the boundary condition patch index.
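+
+As a minimal sketch (see also `examples/hypotests/case.py`), the following case-dictionary
+excerpt requests a single velocity-inlet patch covering part of the boundary at the
+beginning of the $x$-axis of a 2D case; the values shown are illustrative only:
+
+```python
+{
+    # Base boundary conditions (lowest-priority layer)
+    'bc_x%beg': -3, 'bc_x%end': -3,
+    'bc_y%beg': -3, 'bc_y%end': -3,
+    # One boundary condition patch, painted on top of bc_x%beg
+    'num_bc_patches': 1,
+    'patch_bc(1)%type': -17,         # Velocity inlet
+    'patch_bc(1)%dir': 1,            # x-axis
+    'patch_bc(1)%loc': -1,           # beginning of the x-axis
+    'patch_bc(1)%geometry': 1,       # Cuboid
+    'patch_bc(1)%centroid(2)': 0.5,  # index 1 (= dir) is ignored
+    'patch_bc(1)%length(2)': 0.26,
+    'patch_bc(1)%vel(1)': 10,        # inlet velocity
+    'patch_bc(1)%vel(2)': 0,
+}
+```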
+
### 7. Formatted Output
| Parameter | Type | Description |
@@ -862,6 +892,13 @@ This includes types exclusive to one-, two-, and three-dimensional problems.
The patch type number (`#`) corresponds to the input value in `input.py` labeled `patch_icpp(j)%%geometry` where $j$ is the patch index.
Each patch requires a different set of parameters, which are also listed in this table.
+### Boundary Condition Patch Geometry Types
+
+| # | Name | Dim. | Requirements |
+| ---: | :----: | :--- | :--- |
+| 1 | Cuboid | 1 & 2 | `%%centroid(1:3)` and `%%length(1:3)` |
+| 2 | Spheroid | 1 & 2 | `%%centroid(1:3)` and `%%radius` |
+
### Immersed Boundary Patch Types
| # | Name | Dim. |
diff --git a/examples/1D_inert_shocktube/export.py b/examples/1D_inert_shocktube/export.py
new file mode 100644
index 0000000000..e676b6cedd
--- /dev/null
+++ b/examples/1D_inert_shocktube/export.py
@@ -0,0 +1,30 @@
+import csv
+import numpy as np
+import statistics
+from tqdm import tqdm
+
+import mfc.viz
+from case import dt, sol_L as sol
+
+
+case = mfc.viz.Case(".", dt)
+
+for name in tqdm(sol.species_names, desc="Loading Variables"):
+ case.load_variable(f"Y_{name}", f"prim.{5 + sol.species_index(name)}")
+case.load_variable("rho", "prim.1")
+case.load_variable("u", "prim.2")
+case.load_variable("p", "prim.3")
+case.load_variable("T", "prim.15")
+
+steps = case.get_timesteps()
+
+for step in [min(steps), max(steps)]:
+ t = step * dt
+
+ with open(f"mfc-{step}.csv", "w") as f:
+ writer = csv.writer(f)
+ keys = ['x'] + list(set(case.get_data()[0].keys()) - set(["x"]))
+ writer.writerow(keys)
+ for ix, x in enumerate(sorted(case.get_coords()[0])):
+ row = [case.get_data()[step][key][ix] for key in keys]
+ writer.writerow(row)
diff --git a/examples/1D_reactive_shocktube/export.py b/examples/1D_reactive_shocktube/export.py
new file mode 100644
index 0000000000..e676b6cedd
--- /dev/null
+++ b/examples/1D_reactive_shocktube/export.py
@@ -0,0 +1,30 @@
+import csv
+import numpy as np
+import statistics
+from tqdm import tqdm
+
+import mfc.viz
+from case import dt, sol_L as sol
+
+
+case = mfc.viz.Case(".", dt)
+
+for name in tqdm(sol.species_names, desc="Loading Variables"):
+ case.load_variable(f"Y_{name}", f"prim.{5 + sol.species_index(name)}")
+case.load_variable("rho", "prim.1")
+case.load_variable("u", "prim.2")
+case.load_variable("p", "prim.3")
+case.load_variable("T", "prim.15")
+
+steps = case.get_timesteps()
+
+for step in [min(steps), max(steps)]:
+ t = step * dt
+
+ with open(f"mfc-{step}.csv", "w") as f:
+ writer = csv.writer(f)
+ keys = ['x'] + list(set(case.get_data()[0].keys()) - set(["x"]))
+ writer.writerow(keys)
+ for ix, x in enumerate(sorted(case.get_coords()[0])):
+ row = [case.get_data()[step][key][ix] for key in keys]
+ writer.writerow(row)
diff --git a/examples/hypotests/case.py b/examples/hypotests/case.py
new file mode 100644
index 0000000000..badaa39155
--- /dev/null
+++ b/examples/hypotests/case.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+import math
+import json
+
+# Numerical setup
+Nx = 201 # Number of grid points in x
+Ny = 201 # Number of grid points in y
+dx = 1./(1.*(Nx+1)) # Grid spacing in x
+dy = 1./(1.*(Ny+1)) # Grid spacing in y
+
+Tend = 64E-06 # End time
+Nt = 2000 # Number of time steps
+mydt = Tend/(1.*Nt) # Time step size
+
+# Configuring case dictionary
+print(json.dumps({
+ # Logistics ================================================
+ 'run_time_info' : 'F',
+ # ==========================================================
+
+ # Computational Domain Parameters ==========================
+ 'x_domain%beg' : 0.E+00, # x start
+ 'x_domain%end' : 2.E+00, # x end
+ 'y_domain%beg' : 0.E+00, # y start
+ 'y_domain%end' : 1.E+00, # y end
+ 'm' : Nx, # Number of grid points in x direction
+ 'n' : Ny, # Number of grid points in y direction
+ 'p' : 0, # Number of grid points in z (for 3D, change this)
+ 'dt' : 1e-6, # Time step size
+ 't_step_start' : 0, # Start time
+ 't_step_stop' : Nt, # End time
+ 't_step_save' : 500, # Save frequency
+ # ==========================================================
+
+ # Simulation Algorithm Parameters ==========================
+    'num_patches' : 1, # One patch (background flow)
+    'model_eqns' : 2, # 5-equation model
+ 'alt_soundspeed' : 'F',
+ 'num_fluids' : 2,
+ 'low_Mach' : 0,
+ 'mpp_lim' : 'F',
+ # ' mixture_err' : 'F',
+ 'time_stepper' : 3,
+ 'weno_order' : 5,
+ 'weno_eps' : 1.E-16,
+ 'weno_Re_flux' : 'F',
+ 'weno_avg' : 'F',
+ 'mapped_weno' : 'F',
+ 'null_weights' : 'F',
+ 'mp_weno' : 'F',
+ 'riemann_solver' : 1,
+ 'wave_speeds' : 1,
+ 'avg_state' : 2,
+ 'bc_x%beg' : -3,
+ 'bc_x%end' : -3,
+ 'bc_y%beg' : -3, # Boundary conditions for y direction
+ 'bc_y%end' : -3,
+ 'num_bc_patches' : 1,
+ 'patch_bc(1)%type' : -17,
+ 'patch_bc(1)%dir' : 1,
+ 'patch_bc(1)%loc' : -1,
+ 'patch_bc(1)%geometry' : 1,
+ 'patch_bc(1)%centroid(1)' : 0,
+ 'patch_bc(1)%centroid(2)' : 0.5,
+ 'patch_bc(1)%length(2)' : 0.26,
+ 'patch_bc(1)%vel(1)' : 10,
+ 'patch_bc(1)%vel(2)' : 0,
+ # ==========================================================
+
+ # Turning on IB ================================
+ 'ib' : 'T',
+ 'num_ibs' : 2,
+
+ # ==========================================================
+
+ # Formatted Database Files Structure Parameters ============
+ 'format' : 1,
+ 'precision' : 2,
+ 'prim_vars_wrt' :'T',
+ 'parallel_io' :'T',
+ # ==========================================================
+
+ # Patch 1 (background flow) ===================
+ 'patch_icpp(1)%geometry' : 3, # 2D geometry
+ 'patch_icpp(1)%x_centroid' : 1.0, # x-center
+ 'patch_icpp(1)%y_centroid' : 0.5, # y-center
+ 'patch_icpp(1)%length_x' : 2.0, # x-length
+ 'patch_icpp(1)%length_y' : 1.0, # y-length
+ 'patch_icpp(1)%vel(1)' : 0.0,
+ 'patch_icpp(1)%vel(2)' : 0.0, # y-velocity '100*sin(3*x*pi)'
+ 'patch_icpp(1)%pres' : 1.E5, # Pressure
+ 'patch_icpp(1)%alpha_rho(1)' : 1000, # Density
+ 'patch_icpp(1)%alpha_rho(2)' : 0.,
+ 'patch_icpp(1)%alpha(1)' : 1,
+ 'patch_icpp(1)%alpha(2)' : 0.,
+ 'patch_icpp(1)%tau_e(1)' : 0.0,
+
+
+ # ==========================================================
+
+    # Immersed Boundary Patch 1 (hypo material) ================
+ 'patch_ib(1)%geometry' : 3, # 2D geometry
+ # 'patch_ib(1)%hcid' : 201,
+ 'patch_ib(1)%x_centroid' : 0.5, # x-center
+ 'patch_ib(1)%y_centroid' : 0.65, # y-center
+ 'patch_ib(1)%length_x' : 1.0, # x-length
+ 'patch_ib(1)%length_y' : 0.04, # y-length
+ 'patch_ib(1)%slip' : 'T',
+
+ # ==========================================================
+
+    # Immersed Boundary Patch 2 (hypo material) ================
+ 'patch_ib(2)%geometry' : 3, # 2D geometry
+ # 'patch_ib(1)%hcid' : 201,
+ 'patch_ib(2)%x_centroid' : 0.5, # x-center
+ 'patch_ib(2)%y_centroid' : 0.35, # y-center
+ 'patch_ib(2)%length_x' : 1.0, # x-length
+ 'patch_ib(2)%length_y' : 0.04, # y-length
+ 'patch_ib(2)%slip' : 'T',
+
+ # Fluids Physical Parameters ===============================
+ 'fluid_pp(1)%gamma' : 1.E+00/(6.12E+00-1.E+00),
+ 'fluid_pp(1)%pi_inf' : 6.12E+00*3.43E+08/(6.12E+00 - 1.E+00),
+ # 'fluid_pp(1)%G' : 0,
+ 'fluid_pp(2)%gamma' : 1.E+00/(1.3E+00-1.E+00),
+ 'fluid_pp(2)%pi_inf' : 1.3E+00*2.E+08/(1.3E+00 - 1.E+00),
+ # 'fluid_pp(2)%G' : 2.7E+05/(2.E+00*(1.E+00 + 0.4E+00)),
+ 'fluid_pp(2)%G' : 1.E7,
+ # ==========================================================
+}))
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index c1f447a260..055ba3a5f9 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -102,3 +102,161 @@
//${message or '"No error description."'}$)
end if
#:enddef
+
+#:def MAKE_LOOPS(impl, loops, gpu = None)
+ #:if gpu != False
+ !$acc parallel loop collapse(${len(loops)}$) gang vector default(present)
+ #:endif
+ #:for index, lbound, hbound in (loops or [])
+ do ${index}$ = ${lbound}$, ${hbound}$
+ #:endfor
+
+ $:impl
+
+ #:for i in range(len(loops or []))
+ end do
+ #:endfor
+ #:enddef
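+#! Illustrative usage (not part of the build): a block such as
+#!     #:block MAKE_LOOPS(loops=[("k", 0, "n"), ("j", 1, "m")])
+#!         a(j, k) = 0d0
+#!     #:endblock
+#! expands to a "do k = 0, n" / "do j = 1, m" nest around the body, preceded by an
+#! "!$acc parallel loop collapse(2) gang vector default(present)" directive unless
+#! gpu=False is passed.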
+
+ #:def ITERATE_OVER_BUFFER_REGION_SIDED(impl, dir = None, loc = None, thickness = None, gpu = None, outer_loops = None, inner_loops = None, pack_v_size = None)
+ #:set thickness = thickness or 'buff_size'
+ #:set outer_loops = outer_loops or []
+ #:set inner_loops = inner_loops or []
+ #:set pack_v_size = pack_v_size or 'v_size'
+
+ if (${dir}$ <= num_dims) then
+
+ if ((${dir}$) == +1 .and. (${loc}$) == -1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("l", 0, "p"), ("k", 0, "n"), ("j", 1, thickness)] + inner_loops, gpu=gpu)
+ x = -j; y = k; z = l ! Regular
+ sx = j - 1; sy = k; sz = l ! Symmetry
+ px = m - (j - 1); py = k; pz = l ! Periodic
+ ex = 0; ey = k; ez = l ! Extrapolation
+ locx = -1; locy = 0; locz = 0
+
+ exlhs = ex; eylhs = ey; ezlhs = ez
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((j - 1) + ${thickness}$*(k + (n + 1)*l))
+#endif
+
+ $:impl
+ #:endblock
+
+ else if ((${dir}$) == +1 .and. (${loc}$) == +1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("l", 0, "p"), ("k", 0, "n"), ("j", 1, thickness)] + inner_loops, gpu=gpu)
+ x = m + j; y = k; z = l ! Regular
+ sx = m - (j - 1); sy = k; sz = l ! Symmetry
+ px = j - 1; py = k; pz = l ! Periodic
+ ex = m; ey = k; ez = l ! Extrapolation
+ locx = +1; locy = 0; locz = 0
+
+ exlhs = 0; eylhs = ey; ezlhs = ez
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((j - 1) + ${thickness}$*(k + (n + 1)*l))
+#endif
+
+ $:impl
+ #:endblock
+
+ else if ((${dir}$) == +2 .and. (${loc}$) == -1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("k", 0, "p"), ("j", 1, thickness), ("l", f"-({thickness})", f"m + ({thickness})")] + inner_loops, gpu=gpu)
+ x = l; y = -j; z = k ! Regular
+ sx = l; sy = j - 1; sz = k ! Symmetry
+ px = l; py = n - (j - 1); pz = k ! Periodic
+ ex = l; ey = 0; ez = k ! Extrapolation
+ locx = 0; locy = -1; locz = 0
+
+ exlhs = ex; eylhs = ey; ezlhs = ez
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((l + ${thickness}$) + (m + 2*${thickness}$+1)*(j - 1 + ${thickness}$*k))
+#endif
+
+ $:impl
+ #:endblock
+
+ else if ((${dir}$) == +2 .and. (${loc}$) == +1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("k", 0, "p"), ("j", 1, thickness), ("l", f"-({thickness})", f"m + ({thickness})")] + inner_loops, gpu=gpu)
+ x = l; y = n + j; z = k ! Regular
+ sx = l; sy = n - (j - 1); sz = k ! Symmetry
+ px = l; py = j - 1; pz = k ! Periodic
+ ex = l; ey = n; ez = k ! Extrapolation
+ locx = 0; locy = +1; locz = 0
+
+ exlhs = ex; eylhs = 0; ezlhs = ez
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((l + ${thickness}$) + (m + 2*${thickness}$+1)*(j - 1 + ${thickness}$*k))
+#endif
+
+ $:impl
+ #:endblock
+
+ else if ((${dir}$) == +3 .and. (${loc}$) == -1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("j", 1, thickness), ("l", f"-({thickness})", f"n + ({thickness})"), ("k", f"-({thickness})", f"m + ({thickness})")] + inner_loops, gpu=gpu)
+ x = k; y = l; z = -j ! Regular
+ sx = k; sy = l; sz = j - 1 ! Symmetry
+ px = k; py = l; pz = p - (j - 1) ! Periodic
+ ex = k; ey = l; ez = 0 ! Extrapolation
+ locx = 0; locy = 0; locz = -1
+
+ exlhs = ex; eylhs = ey; ezlhs = ez
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((k + ${thickness}$) + (m + 2*${thickness}$+1)*((l + ${thickness}$) + (n + 2*${thickness}$+1)*(j - 1)))
+#endif
+
+ $:impl
+ #:endblock
+
+ else if ((${dir}$) == +3 .and. (${loc}$) == +1) then
+
+ #:block MAKE_LOOPS(loops = outer_loops + [("j", 1, thickness), ("l", f"-({thickness})", f"n + ({thickness})"), ("k", f"-({thickness})", f"m + ({thickness})")] + inner_loops, gpu=gpu)
+ x = k; y = l; z = p + j ! Regular
+ sx = k; sy = l; sz = p - (j - 1) ! Symmetry
+ px = k; py = l; pz = j - 1 ! Periodic
+ ex = k; ey = l; ez = p ! Extrapolation
+ locx = 0; locy = 0; locz = +1
+
+ exlhs = ex; eylhs = ey; ezlhs = 0
+ exlhs = f_clamp(exlhs, 0, m); eylhs = f_clamp(eylhs, 0, n); ezlhs = f_clamp(ezlhs, 0, p)
+
+#ifndef MFC_PRE_PROCESS
+ pack_idr = (i - 1) + ${pack_v_size}$*((k + ${thickness}$) + (m + 2*${thickness}$+1)*((l + ${thickness}$) + (n + 2*${thickness}$+1)*(j - 1)))
+#endif
+
+ $:impl
+ #:endblock
+
+ else
+
+ stop "Invalid boundary condition direction or location."
+
+ end if
+
+ end if
+ #:enddef ITERATE_OVER_BUFFER_REGION_SIDED
+
+ #:def BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+ integer :: i, j, k, l, q
+
+ integer :: x, y, z
+ integer :: sx, sy, sz
+ integer :: px, py, pz
+ integer :: ex, ey, ez
+ integer :: locx, locy, locz
+ integer :: exlhs, eylhs, ezlhs
+ integer :: pack_idr
+ #:enddef
diff --git a/src/common/m_boundary_conditions_common.fpp b/src/common/m_boundary_conditions_common.fpp
new file mode 100644
index 0000000000..bd17a661ba
--- /dev/null
+++ b/src/common/m_boundary_conditions_common.fpp
@@ -0,0 +1,484 @@
+#:include 'macros.fpp'
+
+module m_boundary_conditions_common
+
+ use m_global_parameters
+
+ use m_mpi_proxy
+
+ use m_helper
+
+ use m_compile_specific
+
+ implicit none
+
+ private; public :: s_initialize_boundary_conditions_module, &
+ s_generate_boundary_condition_patch_buffers, &
+ bc_id_sfs, bc_id_has_bc, &
+ s_read_boundary_condition_files, &
+ s_write_boundary_condition_files
+
+    ! Boundary condition structure. bc_id_sfs(dir, loc)%sf(x, y, z) holds the
+    ! boundary condition information needed at the point (x, y, z) on the domain
+    ! surface on side loc of direction dir, so that said condition can be applied.
+ type(t_bc_id_sf), dimension(1:3, -1:1) :: bc_id_sfs
+
+#ifdef MFC_MPI
+ integer :: mpi_bc_id_type
+ integer, dimension(1:3, -1:1) :: mpi_bc_sf_type
+#endif
+
+    ! Optimization structure. Holds whether a boundary condition can be found
+    ! on a surface. bc_id_has_bc(dir, loc, 0) is true if MPI communication is
+    ! required for at least one point on the surface on side loc of direction dir.
+    ! In general, bc_id_has_bc(dir, loc, type) is true if a boundary condition
+    ! with that type can be found somewhere on that surface.
+ logical, dimension(1:3, -1:1, -num_bcs_max:0) :: bc_id_has_bc
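+    ! For example (illustrative): bc_id_has_bc(1, -1, -2) being true means a
+    ! symmetry condition (-2) exists somewhere on the face at the beginning of
+    ! the x-direction, while bc_id_has_bc(1, -1, 0) being true means at least
+    ! one point on that face is filled from a neighboring rank via MPI.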
+
+!$acc declare create(bc_id_sfs)
+
+#ifndef MFC_PRE_PROCESS
+ public :: s_populate_prim_buffers, s_populate_cons_buffers
+#endif
+
+contains
+
+ subroutine s_initialize_boundary_conditions_module()
+
+ integer :: iter_loc
+
+ do iter_loc = -1, 2, 2
+ @:ALLOCATE(bc_id_sfs(1, iter_loc)%sf(0:0, 0:n, 0:p))
+ if (num_dims > 1) then
+ @:ALLOCATE(bc_id_sfs(2, iter_loc)%sf(0:m, 0:0, 0:p))
+
+ if (num_dims > 2) then
+ @:ALLOCATE(bc_id_sfs(3, iter_loc)%sf(0:m, 0:n, 0:0))
+ end if
+ end if
+ end do
+
+#ifdef MFC_MPI
+ call s_create_mpi_types()
+#endif
+
+ end subroutine s_initialize_boundary_conditions_module
+
+#ifdef MFC_MPI
+ subroutine s_create_mpi_types()
+
+ use mpi
+
+ integer :: blocklengths(2) = (/1, 3/)
+ integer(KIND=MPI_ADDRESS_KIND) :: displacements(2) = (/0, 8/) ! Offset between fields in memory
+ integer :: types(2) = (/MPI_INTEGER, MPI_DOUBLE_PRECISION/)
+ integer, dimension(1:3) :: sf_extents_loc, sf_start_idx
+ integer :: ierr
+
+ integer :: iter_dir, iter_loc
+
+ call MPI_Type_create_struct(2, blocklengths, displacements, types, mpi_bc_id_type, ierr)
+ call MPI_Type_commit(mpi_bc_id_type, ierr)
+
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+
+ sf_start_idx = (/0, 0, 0/)
+ sf_extents_loc = shape(bc_id_sfs(iter_dir, iter_loc)%sf)
+
+ call MPI_TYPE_CREATE_SUBARRAY(num_dims, sf_extents_loc, sf_extents_loc, sf_start_idx, &
+ MPI_ORDER_FORTRAN, mpi_bc_id_type, mpi_bc_sf_type(iter_dir, iter_loc), ierr)
+ call MPI_TYPE_COMMIT(mpi_bc_sf_type(iter_dir, iter_loc), ierr)
+
+ end do
+ end do
+
+ end subroutine s_create_mpi_types
+#endif
+
+ subroutine s_write_boundary_condition_files(step_dirpath)
+
+ character(LEN=*), intent(in) :: step_dirpath
+
+ integer :: iter_dir, iter_loc
+ character(len=path_len) :: file_path
+
+ character(len=10) :: status
+
+#ifdef MFC_MPI
+ integer :: ierr
+ integer :: file_id
+ integer :: offset
+ character(len=7) :: proc_rank_str
+#endif
+
+#ifdef MFC_PRE_PROCESS
+ if (old_grid) then
+ status = 'old'
+ else
+ status = 'new'
+ end if
+#else
+ status = 'unknown'
+#endif
+
+ if (parallel_io .eqv. .false.) then
+ file_path = trim(step_dirpath)//'/bc.dat'
+ open (1, FILE=trim(file_path), FORM='unformatted', STATUS=status)
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ write (1) bc_id_sfs(iter_dir, iter_loc)%sf
+ end do
+ end do
+ close (1)
+ else
+#ifdef MFC_MPI
+ write (proc_rank_str, '(I7.7)') proc_rank
+ file_path = trim(step_dirpath)//'/bc_'//trim(proc_rank_str)//'.dat'
+ call MPI_File_open(MPI_COMM_SELF, trim(file_path), MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, file_id, ierr)
+
+ offset = 0
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_bc_id_type, mpi_bc_sf_type(iter_dir, iter_loc), 'native', MPI_INFO_NULL, ierr)
+ call MPI_File_write_all(file_id, bc_id_sfs(iter_dir, iter_loc)%sf, 1, mpi_bc_sf_type(iter_dir, iter_loc), MPI_STATUS_IGNORE, ierr)
+ offset = offset + sizeof(bc_id_sfs(iter_dir, iter_loc)%sf)
+ end do
+ end do
+
+ call MPI_File_close(file_id, ierr)
+#endif
+ end if
+
+ end subroutine s_write_boundary_condition_files
+
+ subroutine s_read_boundary_condition_files(step_dirpath)
+
+ character(LEN=*), intent(in) :: step_dirpath
+
+ integer :: iter_dir, iter_loc
+ logical :: file_exist
+ character(len=path_len) :: file_path
+
+#ifdef MFC_MPI
+ integer :: ierr
+ integer :: file_id
+ integer :: offset
+ character(len=7) :: proc_rank_str
+#endif
+
+ if (parallel_io .eqv. .false.) then
+ file_path = trim(step_dirpath)//'/bc.dat'
+ else
+#ifdef MFC_MPI
+ write (proc_rank_str, '(I7.7)') proc_rank
+ file_path = trim(step_dirpath)//'/bc_'//trim(proc_rank_str)//'.dat'
+#endif
+ end if
+
+ inquire (FILE=trim(file_path), EXIST=file_exist)
+ if (.not. file_exist) then
+ call s_mpi_abort(trim(file_path)//' is missing. Exiting ...')
+ end if
+
+ if (parallel_io .eqv. .false.) then
+ open (1, FILE=trim(file_path), FORM='unformatted', STATUS='unknown')
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ read (1) bc_id_sfs(iter_dir, iter_loc)%sf
+ end do
+ end do
+ close (1)
+ else
+#ifdef MFC_MPI
+ call MPI_File_open(MPI_COMM_SELF, trim(file_path), MPI_MODE_RDONLY, MPI_INFO_NULL, file_id, ierr)
+
+ offset = 0
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_bc_id_type, mpi_bc_sf_type(iter_dir, iter_loc), 'native', MPI_INFO_NULL, ierr)
+ call MPI_File_read_all(file_id, bc_id_sfs(iter_dir, iter_loc)%sf, 1, mpi_bc_sf_type(iter_dir, iter_loc), MPI_STATUS_IGNORE, ierr)
+ offset = offset + sizeof(bc_id_sfs(iter_dir, iter_loc)%sf)
+ end do
+ end do
+
+ call MPI_File_close(file_id, ierr)
+#endif
+ end if
+
+ call s_generate_boundary_condition_lookup_buffers()
+
+ end subroutine s_read_boundary_condition_files
+
+ subroutine s_generate_boundary_condition_patch_buffers()
+
+ type(bc_patch_parameters) :: bc
+ integer :: iter_dir, iter_loc
+ real(kind(0d0)) :: radius2
+
+ type(int_bounds_info), dimension(1:3) :: user_input_bcs
+
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+
+ user_input_bcs = [bc_x, bc_y, bc_z]
+
+ do iter_dir = 1, num_dims
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="-1", thickness=1, gpu=False)
+ bc_id_sfs(iter_dir, -1)%sf(exlhs, eylhs, ezlhs)%type = user_input_bcs(iter_dir)%beg
+ bc_id_sfs(iter_dir, -1)%sf(exlhs, eylhs, ezlhs)%vel = user_input_bcs(iter_dir)%vel_beg
+ #:endblock
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="+1", thickness=1, gpu=False)
+ bc_id_sfs(iter_dir, +1)%sf(exlhs, eylhs, ezlhs)%type = user_input_bcs(iter_dir)%end
+ bc_id_sfs(iter_dir, +1)%sf(exlhs, eylhs, ezlhs)%vel = user_input_bcs(iter_dir)%vel_end
+ #:endblock
+ end do
+
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc", thickness=1, gpu=False)
+ do i = 1, num_bc_patches
+ bc = patch_bc(i)
+ if (bc%geometry == 1) then ! Cuboid
+ #:for dir, name in [(1, "x"), (2, "y"), (3, "z")]
+ if (${dir}$ /= bc%dir .and. ${dir}$ <= num_dims) then
+ if (${name}$_cc(e${name}$lhs) >= bc%centroid(${dir}$) + 0.5d0*bc%length(${dir}$)) then
+ cycle
+ end if
+ if (${name}$_cc(e${name}$lhs) <= bc%centroid(${dir}$) - 0.5d0*bc%length(${dir}$)) then
+ cycle
+ end if
+ end if
+ #:endfor
+ elseif (bc%geometry == 2) then ! Spheroid
+ radius2 = 0d0
+ #:for dir, name in [(1, "x"), (2, "y"), (3, "z")]
+ if (${dir}$ /= bc%dir .and. ${dir}$ <= num_dims) then
+ radius2 = radius2 + (${name}$_cc(e${name}$lhs) - bc%centroid(${dir}$))**2
+ end if
+ #:endfor
+ if (radius2 > bc%radius**2) then
+ cycle
+ end if
+ end if
+
+ bc_id_sfs(bc%dir, bc%loc)%sf(exlhs, eylhs, ezlhs)%type = bc%type
+ bc_id_sfs(bc%dir, bc%loc)%sf(exlhs, eylhs, ezlhs)%vel = bc%vel
+ end do
+ #:endblock
+ end do
+ end do
+
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc", thickness=1, gpu=False)
+ if (proc_nums(iter_dir) > 1 .and. ( &
+ bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type == -1 &
+ .or. (iter_loc == -1 .and. proc_coords(iter_dir) > 0) &
+ .or. (iter_loc == +1 .and. proc_coords(iter_dir) < proc_nums(iter_dir) - 1) &
+ )) then
+ bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type = neighbor_procs(iter_dir, iter_loc)
+ end if
+ #:endblock
+
+ !$acc update device(bc_id_sfs(iter_dir, iter_loc)%sf)
+ end do
+ end do
+
+ call s_generate_boundary_condition_lookup_buffers()
+
+ end subroutine s_generate_boundary_condition_patch_buffers
+
+ subroutine s_generate_boundary_condition_lookup_buffers
+
+ integer :: iter_dir, iter_loc
+
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ bc_id_has_bc(iter_dir, iter_loc, :) = .false.
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc", thickness=1, gpu=False)
+ bc_id_has_bc(iter_dir, iter_loc, min(0, bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type)) = .true.
+ #:endblock
+ end do
+ end do
+
+ end subroutine s_generate_boundary_condition_lookup_buffers
+
+#ifndef MFC_PRE_PROCESS
+ !> The purpose of this procedure is to populate the buffers
+ !! of the primitive variables, depending on the selected
+ !! boundary conditions.
+ subroutine s_populate_prim_buffers(q_prim_vf &
+#ifdef MFC_SIMULATION
+ , pb, mv &
+#endif
+ )
+
+ type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
+#ifdef MFC_SIMULATION
+ real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
+#endif
+
+ integer :: iter_dir, iter_loc
+
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+
+ do iter_dir = 1, num_dims
+ do iter_loc = -1, 1, 2
+ if (any(bc_id_has_bc(iter_dir, iter_loc, :-1))) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc")
+ select case (bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type)
+ case (-13:-3); ! Ghost-cell extrap.
+ !$acc loop seq
+ do i = 1, sys_size
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(ex, ey, ez)
+ end do
+ case (-2); ! Symmetry
+ !$acc loop seq
+ do i = 1, momxb + iter_dir - 2
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(sx, sy, sz)
+ end do
+
+ q_prim_vf(momxb + iter_dir - 1)%sf(x, y, z) = &
+ -q_prim_vf(momxb + iter_dir - 1)%sf(sx, sy, sz)
+
+ !$acc loop seq
+ do i = momxb + iter_dir, sys_size
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(sx, sy, sz)
+ end do
+ case (-1); ! Periodic
+ !$acc loop seq
+ do i = 1, sys_size
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(px, py, pz)
+ end do
+                        case (-15); ! Slip Wall
+ !$acc loop seq
+ do i = 1, sys_size
+ if (i == momxb + iter_dir - 1) then
+ q_prim_vf(i)%sf(x, y, z) = &
+ -q_prim_vf(i)%sf(sx, sy, sz) + 2d0*bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%vel(iter_dir)
+ else
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(ex, ey, ez)
+ end if
+ end do
+ case (-16); ! No Slip Wall
+ !$acc loop seq
+ do i = 1, sys_size
+ if (i >= momxb .and. i <= momxe) then
+ q_prim_vf(i)%sf(x, y, z) = &
+ -q_prim_vf(i)%sf(sx, sy, sz) + 2d0*bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%vel(i - momxb + 1)
+ else
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(ex, ey, ez)
+ end if
+ end do
+ case (-17); ! Velocity Inlet
+ !$acc loop seq
+ do i = 1, sys_size
+ if (i >= momxb .and. i <= momxe) then
+ q_prim_vf(i)%sf(x, y, z) = &
+ bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%vel(i - momxb + 1)
+ else
+ q_prim_vf(i)%sf(x, y, z) = q_prim_vf(i)%sf(ex, ey, ez)
+ end if
+ end do
+ case (-14); ! Axis
+ if (z_cc(k) < pi) then
+ !$acc loop seq
+ do i = 1, momxb
+ q_prim_vf(i)%sf(x, y, z) = &
+ q_prim_vf(i)%sf(sx, sy, z + ((p + 1)/2))
+ end do
+
+ q_prim_vf(momxb + 1)%sf(x, y, z) = &
+ -q_prim_vf(momxb + 1)%sf(sx, sy, z + ((p + 1)/2))
+
+ q_prim_vf(momxe)%sf(x, y, z) = &
+ -q_prim_vf(momxe)%sf(sx, sy, z + ((p + 1)/2))
+
+ !$acc loop seq
+ do i = E_idx, sys_size
+ q_prim_vf(i)%sf(x, y, z) = &
+ q_prim_vf(i)%sf(sx, sy, z + ((p + 1)/2))
+ end do
+ else
+ !$acc loop seq
+ do i = 1, momxb
+ q_prim_vf(i)%sf(x, y, z) = &
+ q_prim_vf(i)%sf(sx, sy, z - ((p + 1)/2))
+ end do
+
+ q_prim_vf(momxb + 1)%sf(x, y, z) = &
+ -q_prim_vf(momxb + 1)%sf(sx, sy, z - ((p + 1)/2))
+
+ q_prim_vf(momxe)%sf(x, y, z) = &
+ -q_prim_vf(momxe)%sf(sx, sy, z - ((p + 1)/2))
+
+ !$acc loop seq
+ do i = E_idx, sys_size
+ q_prim_vf(i)%sf(x, y, z) = &
+ q_prim_vf(i)%sf(sx, sy, z - ((p + 1)/2))
+ end do
+ end if
+
+ end select
+ #:endblock
+
+#ifdef MFC_SIMULATION
+ if (qbmm .and. .not. polytropic) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc", outer_loops=[("i", 1, "nb"), ("q", 1, "nnode")])
+ select case (bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type)
+ case (-13:-3); ! Ghost-cell extrap.
+ pb(x, y, z, q, i) = pb(ex, ey, ez, q, i)
+ mv(x, y, z, q, i) = mv(ex, ey, ez, q, i)
+ case (-2); ! Symmetry
+ pb(x, y, z, q, i) = pb(sx, sy, sz, q, i)
+ mv(x, y, z, q, i) = mv(sx, sy, sz, q, i)
+ case (-1); ! Periodic
+ pb(x, y, z, q, i) = pb(px, py, pz, q, i)
+ mv(x, y, z, q, i) = mv(px, py, pz, q, i)
+ case (-14); ! Axis
+ pb(x, y, z, q, i) = pb(sx, sy, z - ((p + 1)/2), q, i)
+ mv(x, y, z, q, i) = mv(sx, sy, z - ((p + 1)/2), q, i)
+ end select
+ #:endblock
+ end if
+#endif
+ end if
+
+ call s_mpi_sendrecv_variables_buffers(q_prim_vf, bc_id_sfs, &
+#ifdef MFC_SIMULATION
+ pb, mv, &
+#endif
+ iter_dir, iter_loc, &
+ bc_id_has_bc)
+
+ end do
+ end do
+
+ end subroutine s_populate_prim_buffers
+
+ !> The purpose of this procedure is to populate the buffers
+ !! of the conservative variables, depending on the selected
+ !! boundary conditions.
+ subroutine s_populate_cons_buffers(q_cons_vf &
+#ifdef MFC_SIMULATION
+ , pb, mv &
+#endif
+ )
+
+ type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+#ifdef MFC_SIMULATION
+ real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
+#endif
+
+ call s_populate_prim_buffers(q_cons_vf &
+#ifdef MFC_SIMULATION
+ , pb, mv &
+#endif
+ )
+
+ end subroutine s_populate_cons_buffers
+#endif
+
+end module m_boundary_conditions_common
diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp
index 3623fabba8..0d232ffc44 100644
--- a/src/common/m_checker_common.fpp
+++ b/src/common/m_checker_common.fpp
@@ -222,7 +222,7 @@ contains
#:endif
if (.not. skip_check) then
- @:PROHIBIT(bc_${X}$%${BOUND}$ /= dflt_int .and. (bc_${X}$%${BOUND}$ > -1 .or. bc_${X}$%${BOUND}$ < -16), &
+ @:PROHIBIT(bc_${X}$%${BOUND}$ /= dflt_int .and. (bc_${X}$%${BOUND}$ > -1 .or. bc_${X}$%${BOUND}$ < -17), &
"bc_${X}$%${BOUND}$ must be between -1 and -16")
@:PROHIBIT(bc_${X}$%${BOUND}$ /= dflt_int .and. bc_${X}$%${BOUND}$ == -14, &
@@ -320,35 +320,23 @@ contains
!> Checks constraints on the inputs for moving boundaries.
!! Called by s_check_inputs_common for all three stages
subroutine s_check_inputs_moving_bc
- #:for X, VB2, VB3 in [('x', 'vb2', 'vb3'), ('y', 'vb3', 'vb1'), ('z', 'vb1', 'vb2')]
- if (any((/bc_${X}$%vb1, bc_${X}$%vb2, bc_${X}$%vb3/) /= 0d0)) then
- if (bc_${X}$%beg == -15) then
- if (any((/bc_${X}$%${VB2}$, bc_${X}$%${VB3}$/) /= 0d0)) then
- call s_mpi_abort("bc_${X}$%beg must be -15 if "// &
- "bc_${X}$%${VB2}$ or bc_${X}$%${VB3}$ "// &
- "is set. Exiting ...")
- end if
- elseif (bc_${X}$%beg /= -16) then
- call s_mpi_abort("bc_${X}$%beg must be -15 or -16 if "// &
- "bc_${X}$%vb[1,2,3] is set. Exiting ...")
- end if
- end if
- #:endfor
- #:for X, VE2, VE3 in [('x', 've2', 've3'), ('y', 've3', 've1'), ('z', 've1', 've2')]
- if (any((/bc_${X}$%ve1, bc_${X}$%ve2, bc_${X}$%ve3/) /= 0d0)) then
- if (bc_${X}$%end == -15) then
- if (any((/bc_${X}$%${VE2}$, bc_${X}$%${VE3}$/) /= 0d0)) then
- call s_mpi_abort("bc_${X}$%end must be -15 if "// &
- "bc_${X}$%${VE2}$ or bc_${X}$%${VE3}$ "// &
- "is set. Exiting ...")
+ #:for DIR in ['x', 'y', 'z']
+ #:for LOC in ['beg', 'end']
+ if (any(bc_${DIR}$%vel_${LOC}$ /= 0d0)) then
+ if (bc_${DIR}$%${LOC}$ == -15) then
+ if (any((/bc_${DIR}$%vel_${LOC}$ (2), bc_${DIR}$%vel_${LOC}$ (3)/) /= 0d0)) then
+ call s_mpi_abort("bc_${DIR}$%${LOC}$ must be -15 if "// &
+ "bc_${DIR}$%vel_${LOC}$[2,3] is set. Exiting ...")
+ end if
+ elseif (bc_${DIR}$%${LOC}$ /= -16) then
+ call s_mpi_abort("bc_${DIR}$%${LOC}$ must be -15 or -16 if "// &
+ "bc_${DIR}$%vel_${LOC}$ is set. Exiting ...")
end if
- elseif (bc_${X}$%end /= -16) then
- call s_mpi_abort("bc_${X}$%end must be -15 or -16 if "// &
- "bc_${X}$%ve[1,2,3] is set. Exiting ...")
end if
- end if
+ #:endfor
#:endfor
+
end subroutine s_check_inputs_moving_bc
end module m_checker_common
diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp
index 8bc120bef7..a1435b4bfa 100644
--- a/src/common/m_constants.fpp
+++ b/src/common/m_constants.fpp
@@ -20,8 +20,12 @@ module m_constants
integer, parameter :: num_fluids_max = 10 !< Maximum number of fluids in the simulation
integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation
integer, parameter :: num_patches_max = 10
+ integer, parameter :: num_bc_patches_max = 20
integer, parameter :: pathlen_max = 400
integer, parameter :: nnode = 4 !< Number of QBMM nodes
+ integer, parameter :: gp_layers = 3 !< Number of ghost point layers
+ integer, parameter :: num_bcs_max = 100 !< Maximum number of boundary condition types.
+
real(kind(0d0)), parameter :: capillary_cutoff = 1e-6 !< color function gradient magnitude at which to apply the surface tension fluxes
real(kind(0d0)), parameter :: acoustic_spatial_support_width = 2.5d0 !< Spatial support width of acoustic source, used in s_source_spatial
real(kind(0d0)), parameter :: dflt_vcfl_dt = 100d0 !< value of vcfl_dt when viscosity is off for computing adaptive timestep size
diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index b94ec89b1b..b43a72824b 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -73,12 +73,8 @@ module m_derived_types
type int_bounds_info
integer :: beg
integer :: end
- real(kind(0d0)) :: vb1
- real(kind(0d0)) :: vb2
- real(kind(0d0)) :: vb3
- real(kind(0d0)) :: ve1
- real(kind(0d0)) :: ve2
- real(kind(0d0)) :: ve3
+ real(kind(0d0)) :: vel_beg(1:3)
+ real(kind(0d0)) :: vel_end(1:3)
real(kind(0d0)) :: pres_in, pres_out
real(kind(0d0)), dimension(3) :: vel_in, vel_out
real(kind(0d0)), dimension(num_fluids_max) :: alpha_rho_in, alpha_in
@@ -320,7 +316,7 @@ module m_derived_types
!> Ghost Point for Immersed Boundaries
type ghost_point
- real(kind(0d0)), dimension(3) :: loc !< Physical location of the ghost point
+        integer, dimension(3) :: loc !< Grid indices of the ghost point
real(kind(0d0)), dimension(3) :: ip_loc !< Physical location of the image point
integer, dimension(3) :: ip_grid !< Top left grid point of IP
real(kind(0d0)), dimension(2, 2, 2) :: interp_coeffs !< Interpolation Coefficients of image point
@@ -348,4 +344,27 @@ module m_derived_types
integer :: gamma_method
end type chemistry_parameters
+ type bc_patch_parameters
+ ! User inputs
+ integer :: type
+ integer :: dir ! [x,y,z] => [1,2,3]
+ integer :: loc ! [beg,end] => [-1,+1]
+ integer :: geometry
+
+ real(kind(0d0)) :: centroid(1:3)
+ real(kind(0d0)) :: length(1:3)
+ real(kind(0d0)) :: radius
+
+ real(kind(0d0)) :: vel(1:3)
+ end type bc_patch_parameters
+
+ type t_bc_id
+ integer :: type
+ real(kind(0d0)) :: vel(1:3)
+ end type t_bc_id
+
+ type t_bc_id_sf
+ type(t_bc_id), dimension(:, :, :), allocatable :: sf
+ end type t_bc_id_sf
+
end module m_derived_types
diff --git a/src/common/m_global_parameters_common.fpp b/src/common/m_global_parameters_common.fpp
new file mode 100644
index 0000000000..9164935d36
--- /dev/null
+++ b/src/common/m_global_parameters_common.fpp
@@ -0,0 +1,27 @@
+module m_global_parameters_common
+
+ use m_constants
+ use m_derived_types
+
+ implicit none
+
+contains
+
+ subroutine s_bc_assign_default_values_to_user_inputs(num_bc_patches, patch_bc)
+
+ integer, intent(inout) :: num_bc_patches
+ type(bc_patch_parameters), intent(inout) :: patch_bc(num_bc_patches_max)
+
+ integer :: i
+
+ num_bc_patches = 0
+ do i = 1, num_bc_patches_max
+ patch_bc(i)%type = dflt_int
+ patch_bc(i)%geometry = dflt_int
+ patch_bc(i)%dir = dflt_int
+ patch_bc(i)%loc = dflt_int
+ end do
+
+ end subroutine s_bc_assign_default_values_to_user_inputs
+
+end module m_global_parameters_common
diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp
index dce2711209..83023ec4cb 100644
--- a/src/common/m_helper.fpp
+++ b/src/common/m_helper.fpp
@@ -11,8 +11,6 @@ module m_helper
use m_global_parameters !< Definitions of the global parameters
- use m_mpi_common !< MPI modules
-
use ieee_arithmetic !< For checking NaN
! ==========================================================================
@@ -36,7 +34,7 @@ module m_helper
s_print_2D_array, &
f_xor, &
f_logical_to_int, &
- s_prohibit_abort
+ f_clamp
contains
@@ -69,9 +67,6 @@ contains
nR3 = dot_product(weights, nRtmp**3.d0)
ntmp = DSQRT((4.d0*pi/3.d0)*nR3/vftmp)
- !ntmp = (3.d0/(4.d0*pi))*0.00001
-
- !print *, "nbub", ntmp
end subroutine s_comp_n_from_cons
@@ -466,20 +461,15 @@ contains
end if
end function f_logical_to_int
- subroutine s_prohibit_abort(condition, message)
- character(len=*), intent(in) :: condition, message
+ function f_clamp(x, xmin, xmax) result(y)
- print *, ""
- print *, "===================================================================================================="
- print *, " CASE FILE ERROR "
- print *, "----------------------------------------------------------------------------------------------------"
- print *, "Prohibited condition: ", trim(condition)
- if (len_trim(message) > 0) then
- print *, "Note: ", trim(message)
- end if
- print *, "===================================================================================================="
- print *, ""
- call s_mpi_abort
- end subroutine s_prohibit_abort
+ !$acc routine seq
+
+ integer, intent(in) :: x, xmin, xmax
+ integer :: y
+
+ y = max(xmin, min(x, xmax))
+
+ end function f_clamp
end module m_helper
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 403ac4f133..fc523922ae 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -13,9 +13,14 @@ module m_mpi_common
use mpi !< Message passing interface (MPI) module
#endif
+ use m_nvtx
+
+ use m_helper
+
use m_derived_types !< Definitions of the derived types
use m_global_parameters !< Definitions of the global parameters
+
! ==========================================================================
implicit none
@@ -25,6 +30,27 @@ module m_mpi_common
integer, private :: ierr
!> @}
+#ifndef MFC_PRE_PROCESS
+ real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_send !<
+ !! This variable is utilized to pack and send the buffer of the cell-average
+ !! conservative variables, for a single computational domain boundary at the
+ !! time, to the relevant neighboring processor.
+
+ real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_recv !<
+ !! q_cons_buff_recv is utilized to receive and unpack the buffer of the cell-
+ !! average conservative variables, for a single computational domain boundary
+ !! at the time, from the relevant neighboring processor.
+
+ !$acc declare create(q_cons_buff_send, q_cons_buff_recv)
+
+ integer :: v_size
+
+ !$acc declare create(v_size)
+#endif
+
+ integer, dimension(3, -1:1) :: neighbor_procs
+ !$acc declare create(neighbor_procs)
+
contains
!> The subroutine initializes the MPI execution environment
@@ -60,6 +86,72 @@ contains
end subroutine s_mpi_initialize
+ subroutine s_initialize_mpi_common_module()
+
+#if defined(MFC_MPI) && !defined(MFC_PRE_PROCESS)
+
+ ! Allocating q_cons_buff_send/recv and ib_buff_send/recv. Please note that
+ ! for the sake of simplicity, both variables are provided sufficient
+ ! storage to hold the largest buffer in the computational domain.
+
+#ifdef MFC_SIMULATION
+ if (qbmm .and. .not. polytropic) then
+ if (n > 0) then
+ if (p > 0) then
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* &
+ & (m + 2*buff_size + 1)* &
+ & (n + 2*buff_size + 1)* &
+ & (p + 2*buff_size + 1)/ &
+ & (min(m, n, p) + 2*buff_size + 1)))
+ else
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* &
+ & (max(m, n) + 2*buff_size + 1)))
+ end if
+ else
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)))
+ end if
+
+ @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1)))
+
+ v_size = sys_size + 2*nb*4
+ else
+#endif
+ if (n > 0) then
+ if (p > 0) then
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* &
+ & (m + 2*buff_size + 1)* &
+ & (n + 2*buff_size + 1)* &
+ & (p + 2*buff_size + 1)/ &
+ & (min(m, n, p) + 2*buff_size + 1)))
+ else
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* &
+ & (max(m, n) + 2*buff_size + 1)))
+ end if
+ else
+ @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size))
+ end if
+
+ @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1)))
+
+ v_size = sys_size
+#ifdef MFC_SIMULATION
+ end if
+#endif
+
+!$acc update device(v_size)
+
+#endif
+
+ end subroutine s_initialize_mpi_common_module
+
+ subroutine s_finalize_mpi_common_module()
+
+#if defined(MFC_MPI) && !defined(MFC_PRE_PROCESS)
+ @:DEALLOCATE(q_cons_buff_send, q_cons_buff_recv)
+#endif
+
+ end subroutine s_finalize_mpi_common_module
+
!! @param q_cons_vf Conservative variables
!! @param ib_markers track if a cell is within the immersed boundary
!! @param levelset closest distance from every cell to the IB
@@ -475,4 +567,699 @@ contains
end subroutine s_mpi_finalize
+#ifndef MFC_PRE_PROCESS
+ !> The goal of this procedure is to populate the buffers of
+ !! the cell-average conservative variables by communicating
+ !! with the neighboring processors.
+ !! @param q_cons_vf Cell-average conservative variables
+ !! @param mpi_dir MPI communication coordinate direction
+ !! @param pbc_loc Processor boundary condition (PBC) location
+ subroutine s_mpi_sendrecv_variables_buffers(q_cons_vf, &
+ bc_id_sfs, &
+#ifdef MFC_SIMULATION
+ pb, mv, &
+#endif
+ mpi_dir, &
+ pbc_loc, &
+ bc_id_has_bc)
+
+ type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+ type(t_bc_id_sf), dimension(1:3, -1:1), intent(in) :: bc_id_sfs
+
+#ifdef MFC_SIMULATION
+ real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
+#endif
+ integer, intent(in) :: mpi_dir, pbc_loc
+
+ logical, dimension(1:3, -1:1, -num_bcs_max:0), intent(in) :: bc_id_has_bc
+
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+
+ type(bc_patch_parameters) :: bc
+
+ integer :: buffer_counts(1:3)
+
+ real(kind(0d0)), pointer :: p_send, p_recv
+
+ integer :: r
+
+#ifdef MFC_MPI
+
+!$acc update device(v_size)
+
+#ifdef MFC_SIMULATION
+ if (qbmm .and. .not. polytropic) then
+ buffer_counts = (/ &
+ buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), &
+ buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), &
+ buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
+ /)
+ else
+#endif
+ buffer_counts = (/ &
+ buff_size*sys_size*(n + 1)*(p + 1), &
+ buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
+ buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
+ /)
+#ifdef MFC_SIMULATION
+ end if
+#endif
+
+ if (bc_id_has_bc(mpi_dir, -pbc_loc, 0)) then
+ call nvtxStartRange("RHS-COMM-PACKBUF")
+
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="-pbc_loc", inner_loops=[("i", 1, "sys_size")])
+ q_cons_buff_send(pack_idr) = q_cons_vf(i)%sf(sx, sy, sz)
+ #:endblock
+
+#ifdef MFC_SIMULATION
+ if (qbmm .and. .not. polytropic) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="-pbc_loc", inner_loops=[("i", "sys_size + 1", "sys_size + 4"), ("q", 1, "nb")])
+ q_cons_buff_send(pack_idr + (q - 1)*4) = pb(sx, sy, sz, i - sys_size, q)
+ q_cons_buff_send(pack_idr + (q - 1)*4 + nb*4) = mv(sx, sy, sz, i - sys_size, q)
+ #:endblock
+ end if
+#endif
+
+ call nvtxEndRange ! Packbuf
+ end if
+
+ p_send => q_cons_buff_send(0)
+ p_recv => q_cons_buff_recv(0)
+
+ #:for rdma_mpi in [False, True]
+ if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+ #:if rdma_mpi
+ !$acc data attach(p_send, p_recv)
+ !$acc host_data use_device(p_send, p_recv)
+
+ call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+ #:else
+ call nvtxStartRange("RHS-COMM-DEV2HOST")
+ !$acc update host(q_cons_buff_send)
+ call nvtxEndRange
+
+ call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
+ #:endif
+
+ call MPI_SENDRECV( &
+ p_send, buffer_counts(mpi_dir), MPI_DOUBLE_PRECISION, neighbor_procs(mpi_dir, -pbc_loc), 0, &
+ p_recv, buffer_counts(mpi_dir), MPI_DOUBLE_PRECISION, neighbor_procs(mpi_dir, +pbc_loc), 0, &
+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+ call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+
+ #:if rdma_mpi
+ !$acc end host_data
+ !$acc end data
+ !$acc wait
+ #:else
+ call nvtxStartRange("RHS-COMM-HOST2DEV")
+ !$acc update device(q_cons_buff_recv)
+ call nvtxEndRange
+ #:endif
+ end if
+ #:endfor
+
+ if (bc_id_has_bc(mpi_dir, pbc_loc, 0)) then
+ call nvtxStartRange("RHS-COMM-UNPACKBUF")
+
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="pbc_loc", inner_loops=[("i", 1, "sys_size")])
+ if (bc_id_sfs(mpi_dir, pbc_loc)%sf(exlhs, eylhs, ezlhs)%type >= 0) then
+ q_cons_vf(i)%sf(x, y, z) = q_cons_buff_recv(pack_idr)
+ end if
+ #:endblock
+
+#ifdef MFC_SIMULATION
+ if (qbmm .and. .not. polytropic) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="pbc_loc", inner_loops=[("i", "sys_size + 1", "sys_size + 4"), ("q", 1, "nb")])
+ if (bc_id_sfs(mpi_dir, pbc_loc)%sf(exlhs, eylhs, ezlhs)%type >= 0) then
+ pb(x, y, z, i - sys_size, q) = q_cons_buff_recv(pack_idr + (q - 1)*4)
+ mv(x, y, z, i - sys_size, q) = q_cons_buff_recv(pack_idr + (q - 1)*4 + nb*4)
+ end if
+ #:endblock
+ end if
+#endif
+
+ call nvtxEndRange
+ end if
+#endif
+
+ end subroutine s_mpi_sendrecv_variables_buffers
+
+ !> The goal of this procedure is to populate the buffers of
+ !! the grid variables by communicating with the neighboring
+ !! processors. Note that only the buffers of the cell-width
+ !! distributions are handled in such a way. This is because
+ !! the buffers of cell-boundary locations may be calculated
+ !! directly from those of the cell-width distributions.
+ subroutine s_mpi_sendrecv_grid_spacing_buffers(bc_id_sfs)
+
+ type(t_bc_id_sf), dimension(1:3, -1:1), intent(in) :: bc_id_sfs
+
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+
+ type(t_bc_id) :: bc
+
+ integer :: iter_loc
+
+ integer :: send_offset, recv_offset
+ integer :: extra_cell_count
+
+ #:for cmp, dir_id, extent in zip(['x', 'y', 'z'], [1, 2, 3], ['m', 'n', 'p'])
+
+ if (${dir_id}$ > num_dims) then
+ return
+ end if
+
+ do iter_loc = -1, 1, 2
+
+#ifdef MFC_MPI
+ if (iter_loc == -1) then
+ send_offset = ${extent}$-buff_size + 1
+ recv_offset = -buff_size
+ else
+ send_offset = 0
+ recv_offset = ${extent}$+1
+ end if
+
+ call MPI_SENDRECV( &
+ d${cmp}$ (send_offset), buff_size, &
+ MPI_DOUBLE_PRECISION, neighbor_procs(${dir_id}$, -iter_loc), 0, &
+ d${cmp}$ (recv_offset), buff_size, &
+ MPI_DOUBLE_PRECISION, neighbor_procs(${dir_id}$, +iter_loc), 0, &
+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+#endif
+
+ ! Note: This does a little TOO much work (iterating over too many cells)
+ ! but this is because dx, dy, dz SHOULD be two dimensional arrays,
+ ! not one-dimensional arrays. Indeed, the boundary conditions
+ ! can vary along two axes, not just one if the case is 3D.
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir=dir_id, loc="iter_loc", gpu=False)
+ bc = bc_id_sfs(${dir_id}$, iter_loc)%sf(exlhs, eylhs, ezlhs)
+
+ if (bc%type <= -3) then
+ d${cmp}$ (${cmp}$) = d${cmp}$ (e${cmp}$)
+ elseif (bc%type == -2) then
+ d${cmp}$ (${cmp}$) = d${cmp}$ (s${cmp}$)
+ elseif (bc%type == -1) then
+ d${cmp}$ (${cmp}$) = d${cmp}$ (p${cmp}$)
+ end if
+ #:endblock
+
+ end do
+
+#ifdef MFC_POST_PROCESS
+ extra_cell_count = offset_${cmp}$%beg
+#else
+ extra_cell_count = buff_size
+#endif
+
+ do i = 1, extra_cell_count
+ ! Computing the cell-boundary locations buffer, at the beginning of
+ ! the coordinate direction, from the cell-width distribution buffer.
+ ${cmp}$_cb(-1 - i) = ${cmp}$_cb(-i) - d${cmp}$ (-i)
+ end do
+
+#ifdef MFC_POST_PROCESS
+ extra_cell_count = offset_${cmp}$%end
+#else
+ extra_cell_count = buff_size
+#endif
+
+ do i = 1, extra_cell_count
+ ! Populating the cell-boundary locations buffer, at the end of the
+ ! coordinate direction, from buffer of the cell-width distribution.
+ ${cmp}$_cb(${extent}$+i) = ${cmp}$_cb(${extent}$+(i - 1)) + d${cmp}$ (${extent}$+i)
+ end do
+
+ do i = 1, buff_size
+ ! Computing the cell-center locations buffer, at the beginning of
+ ! the coordinate direction, from the cell-width distribution buffer.
+ ${cmp}$_cc(-i) = ${cmp}$_cc(1 - i) - (d${cmp}$ (1 - i) + d${cmp}$ (-i))/2d0
+
+ ! Populating the cell-center locations buffer, at the end of the
+ ! coordinate direction, from buffer of the cell-width distribution.
+ ${cmp}$_cc(${extent}$+i) = ${cmp}$_cc(${extent}$+(i - 1)) + (d${cmp}$ (${extent}$+(i - 1)) + d${cmp}$ (${extent}$+i))/2d0
+ end do
+ #:endfor
+
+ end subroutine s_mpi_sendrecv_grid_spacing_buffers
+#endif
+
+ !> The purpose of this procedure is to optimally decompose
+ !! the computational domain among the available processors.
+ !! This is performed by attempting to award each processor,
+ !! in each of the coordinate directions, approximately the
+ !! same number of cells, and then recomputing the affected
+ !! global parameters.
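+    !! For example (illustrative): with m + 1 = 103 cells in x and
+    !! proc_nums(1) = 4 ranks, each rank is first awarded 25 cells and the
+    !! 3 remaining cells go to ranks 0 through 2, so ranks 0-2 end up with
+    !! 26 cells each and rank 3 with 25.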
+ subroutine s_mpi_decompose_computational_domain
+
+#ifdef MFC_MPI
+
+ real(kind(0d0)) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z !<
+ !! Non-optimal number of processors in the x-, y- and z-directions
+
+ real(kind(0d0)) :: fct_min !<
+ !! Processor factorization (fct) minimization parameter
+
+ integer :: MPI_COMM_CART !<
+ !! Cartesian processor topology communicator
+
+ integer, dimension(1:3) :: rem_cells !<
+ !! Remaining number of cells, in a particular coordinate direction,
+ !! after the majority is divided up among the available processors
+
+ integer :: i, j !< Generic loop iterators
+
+ integer :: iter_dir
+
+#endif
+
+ proc_coords(:) = 0
+ proc_nums(:) = 1
+
+#ifdef MFC_MPI
+
+ if (num_procs == 1 .and. parallel_io) then
+ do i = 1, num_dims
+ start_idx(i) = 0
+ end do
+ return
+ end if
+
+ ! 3D Cartesian Processor Topology ==================================
+ if (n > 0) then
+
+ if (p > 0) then
+
+ if (cyl_coord .and. p > 0) then
+ ! Implement pencil processor blocking if using cylindrical coordinates so
+ ! that all cells in azimuthal direction are stored on a single processor.
+ ! This is necessary for efficient application of Fourier filter near axis.
+
+ ! Initial values of the processor factorization optimization
+ proc_nums(1) = 1
+ proc_nums(2) = num_procs
+ proc_nums(3) = 1
+ ierr = -1
+
+ ! Computing minimization variable for these initial values
+ tmp_num_procs_x = proc_nums(1)
+ tmp_num_procs_y = proc_nums(2)
+ tmp_num_procs_z = proc_nums(3)
+ fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y)
+
+ ! Searching for optimal computational domain distribution
+ do i = 1, num_procs
+
+ if (mod(num_procs, i) == 0 &
+ .and. &
+ (m + 1)/i >= num_stcls_min*weno_order) then
+
+ tmp_num_procs_x = i
+ tmp_num_procs_y = num_procs/i
+
+ if (fct_min >= abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y) &
+ .and. &
+ (n + 1)/tmp_num_procs_y &
+ >= &
+ num_stcls_min*weno_order) then
+
+ proc_nums(1) = i
+ proc_nums(2) = num_procs/i
+ fct_min = abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y)
+ ierr = 0
+
+ end if
+
+ end if
+
+ end do
+
+ else
+
+ ! Initial estimate of optimal processor topology
+ proc_nums(1) = 1
+ proc_nums(2) = 1
+ proc_nums(3) = num_procs
+ ierr = -1
+
+ ! Benchmarking the quality of this initial guess
+ tmp_num_procs_x = proc_nums(1)
+ tmp_num_procs_y = proc_nums(2)
+ tmp_num_procs_z = proc_nums(3)
+ fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y) &
+ + 10d0*abs((n + 1)/tmp_num_procs_y &
+ - (p + 1)/tmp_num_procs_z)
+
+ ! Optimization of the initial processor topology
+ do i = 1, num_procs
+
+ if (mod(num_procs, i) == 0 &
+ .and. &
+ (m + 1)/i >= num_stcls_min*weno_order) then
+
+ do j = 1, num_procs/i
+
+ if (mod(num_procs/i, j) == 0 &
+ .and. &
+ (n + 1)/j >= num_stcls_min*weno_order) then
+
+ tmp_num_procs_x = i
+ tmp_num_procs_y = j
+ tmp_num_procs_z = num_procs/(i*j)
+
+ if (fct_min >= abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y) &
+ + abs((n + 1)/tmp_num_procs_y &
+ - (p + 1)/tmp_num_procs_z) &
+ .and. &
+ (p + 1)/tmp_num_procs_z &
+ >= &
+ num_stcls_min*weno_order) &
+ then
+
+ proc_nums(1) = i
+ proc_nums(2) = j
+ proc_nums(3) = num_procs/(i*j)
+ fct_min = abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y) &
+ + abs((n + 1)/tmp_num_procs_y &
+ - (p + 1)/tmp_num_procs_z)
+ ierr = 0
+
+ end if
+
+ end if
+
+ end do
+
+ end if
+
+ end do
+
+ end if
+
+ ! Verifying that a valid decomposition of the computational
+ ! domain has been established. If not, the simulation exits.
+ if (proc_rank == 0 .and. ierr == -1) then
+ call s_mpi_abort('Unsupported combination of values '// &
+ 'of num_procs, m, n, p and '// &
+ 'weno_order. Exiting ...')
+ end if
+
+ ! Creating new communicator using the Cartesian topology
+ call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/proc_nums(1), &
+ proc_nums(2), proc_nums(3)/), &
+ (/.true., .true., .true./), &
+ .false., MPI_COMM_CART, ierr)
+
+ ! Finding the Cartesian coordinates of the local process
+ call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, &
+ proc_coords, ierr)
+ ! END: 3D Cartesian Processor Topology =============================
+
+ ! Global Parameters for z-direction ================================
+
+ ! Number of remaining cells
+ rem_cells(3) = mod(p + 1, proc_nums(3))
+
+#ifdef MFC_PRE_PROCESS
+ ! Preliminary uniform cell-width spacing
+ if (old_grid .neqv. .true.) then
+ dz = (z_domain%end - z_domain%beg)/real(p + 1, kind(0d0))
+ end if
+#endif
+
+ ! Optimal number of cells per processor
+ p = (p + 1)/proc_nums(3) - 1
+
+ ! Distributing the remaining cells
+ do i = 1, rem_cells(3)
+ if (proc_coords(3) == i - 1) then
+ p = p + 1; exit
+ end if
+ end do
+
+ ! ==================================================================
+
+ ! 2D Cartesian Processor Topology ==================================
+ else
+
+ ! Initial estimate of optimal processor topology
+ proc_nums(1) = 1
+ proc_nums(2) = num_procs
+ ierr = -1
+
+ ! Benchmarking the quality of this initial guess
+ tmp_num_procs_x = proc_nums(1)
+ tmp_num_procs_y = proc_nums(2)
+ fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y)
+
+ ! Optimization of the initial processor topology
+ do i = 1, num_procs
+
+ if (mod(num_procs, i) == 0 &
+ .and. &
+ (m + 1)/i >= num_stcls_min*weno_order) then
+
+ tmp_num_procs_x = i
+ tmp_num_procs_y = num_procs/i
+
+ if (fct_min >= abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y) &
+ .and. &
+ (n + 1)/tmp_num_procs_y &
+ >= &
+ num_stcls_min*weno_order) then
+
+ proc_nums(1) = i
+ proc_nums(2) = num_procs/i
+ fct_min = abs((m + 1)/tmp_num_procs_x &
+ - (n + 1)/tmp_num_procs_y)
+ ierr = 0
+
+ end if
+
+ end if
+
+ end do
+
+ ! Verifying that a valid decomposition of the computational
+ ! domain has been established. If not, the simulation exits.
+ if (proc_rank == 0 .and. ierr == -1) then
+ call s_mpi_abort('Unsupported combination of values '// &
+ 'of num_procs, m, n and '// &
+ 'weno_order. Exiting ...')
+ end if
+
+ ! Creating new communicator using the Cartesian topology
+ call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/proc_nums(1), &
+ proc_nums(2)/), (/.true., &
+ .true./), .false., MPI_COMM_CART, &
+ ierr)
+
+ ! Finding the Cartesian coordinates of the local process
+ call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, &
+ proc_coords, ierr)
+
+ end if
+ ! END: 2D Cartesian Processor Topology =============================
+
+ ! Global Parameters for y-direction ================================
+
+ ! Number of remaining cells
+ rem_cells(2) = mod(n + 1, proc_nums(2))
+
+#ifdef MFC_PRE_PROCESS
+ ! Preliminary uniform cell-width spacing
+ if (old_grid .neqv. .true.) then
+ dy = (y_domain%end - y_domain%beg)/real(n + 1, kind(0d0))
+ end if
+#endif
+
+ ! Base (even-split) number of cells per processor; the remainder is distributed below
+ n = (n + 1)/proc_nums(2) - 1
+
+ ! Distributing the remaining cells
+ do i = 1, rem_cells(2)
+ if (proc_coords(2) == i - 1) then
+ n = n + 1; exit
+ end if
+ end do
+
+ ! ==================================================================
+
+ ! 1D Cartesian Processor Topology ==================================
+ else
+
+ ! Optimal processor topology
+ proc_nums(1) = num_procs
+
+#ifdef MFC_POST_PROCESS
+ ! Number of cells in undecomposed computational domain needed
+ ! for sub-domain reassembly during formatted data output
+ m_root = m
+#endif
+
+ ! Creating new communicator using the Cartesian topology
+ call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/proc_nums(1)/), &
+ (/.true./), .false., MPI_COMM_CART, &
+ ierr)
+
+ ! Finding the Cartesian coordinates of the local process
+ call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, &
+ proc_coords, ierr)
+
+ end if
+ ! ==================================================================
+
+ ! Global Parameters for x-direction ================================
+
+ ! Number of remaining cells
+ rem_cells(1) = mod(m + 1, proc_nums(1))
+
+#ifdef MFC_PRE_PROCESS
+ ! Preliminary uniform cell-width spacing
+ if (old_grid .neqv. .true.) then
+ dx = (x_domain%end - x_domain%beg)/real(m + 1, kind(0d0))
+ end if
+#endif
+
+ ! Base (even-split) number of cells per processor; the remainder is distributed below
+ m = (m + 1)/proc_nums(1) - 1
+
+ ! Distributing the remaining cells
+ do i = 1, rem_cells(1)
+ if (proc_coords(1) == i - 1) then
+ m = m + 1; exit
+ end if
+ end do
+ ! ==================================================================
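As a concrete check of the splits above (with hypothetical sizes): for 103 global cells in x (m = 102) on proc_nums(1) = 4 ranks, rem_cells(1) = mod(103, 4) = 3 and the even split leaves m = 103/4 - 1 = 24 locally, i.e. 25 cells per rank; the loop then hands one extra cell to the ranks with proc_coords(1) = 0, 1, and 2, so three ranks carry 26 cells, the last carries 25, and 3*26 + 25 = 103 recovers the global count. The y- and z-direction blocks above apply the same split to n and p.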
+
+ #:for cmp, dir, extent in zip(['z', 'y', 'x'], [3, 2, 1], ['p', 'n', 'm'])
+
+ if (${dir}$ <= num_dims) then
+
+#ifdef MFC_PRE_PROCESS
+ ! Beginning and end sub-domain boundary locations
+ if (parallel_io .neqv. .true.) then
+ if (old_grid .neqv. .true.) then
+ if (proc_coords(${dir}$) < rem_cells(${dir}$)) then
+ ${cmp}$_domain%beg = ${cmp}$_domain%beg + d${cmp}$*real((${extent}$+1)* &
+ proc_coords(${dir}$))
+ ${cmp}$_domain%end = ${cmp}$_domain%end - d${cmp}$*real((${extent}$+1)* &
+ (proc_nums(${dir}$) - proc_coords(${dir}$) - 1) &
+ - (proc_nums(${dir}$) - rem_cells(${dir}$)))
+ else
+ ${cmp}$_domain%beg = ${cmp}$_domain%beg + d${cmp}$*real((${extent}$+1)* &
+ proc_coords(${dir}$) + rem_cells(${dir}$))
+ ${cmp}$_domain%end = ${cmp}$_domain%end - d${cmp}$*real((${extent}$+1)* &
+ (proc_nums(${dir}$) - proc_coords(${dir}$) - 1))
+ end if
+ end if
+ else
+ if (proc_coords(${dir}$) < rem_cells(${dir}$)) then
+ start_idx(${dir}$) = (${extent}$+1)*proc_coords(${dir}$)
+ else
+ start_idx(${dir}$) = (${extent}$+1)*proc_coords(${dir}$) + rem_cells(${dir}$)
+ end if
+ end if
+#endif
+
+#ifdef MFC_POST_PROCESS
+ ! Ghost zone at the beginning
+ if (proc_coords(${dir}$) > 0 .and. format == 1) then
+ offset_${cmp}$%beg = 2
+ else
+ offset_${cmp}$%beg = 0
+ end if
+
+ ! Ghost zone at the end
+ if (proc_coords(${dir}$) < proc_nums(${dir}$) - 1 .and. format == 1) then
+ offset_${cmp}$%end = 2
+ else
+ offset_${cmp}$%end = 0
+ end if
+#endif
+
+#ifndef MFC_PRE_PROCESS
+ if (parallel_io) then
+ if (proc_coords(${dir}$) < rem_cells(${dir}$)) then
+ start_idx(${dir}$) = (${extent}$+1)*proc_coords(${dir}$)
+ else
+ start_idx(${dir}$) = (${extent}$+1)*proc_coords(${dir}$) + rem_cells(${dir}$)
+ end if
+ end if
+#endif
+
+ end if
+
+ #:endfor
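The #:for block stamps out identical bookkeeping for the z-, y-, and x-directions. Among the quantities it computes, start_idx is the one parallel I/O relies on: the global index of the first cell owned by a rank. The standalone sketch below (hypothetical sizes, reusing the 103-cells-on-4-ranks example from above) evaluates that formula for every rank and shows that the ranks tile the global index range contiguously.

    program start_idx_sketch
        ! Sketch of the start_idx formula generated by the fypp loop above,
        ! for 103 global cells split over 4 ranks (hypothetical sizes).
        implicit none
        integer, parameter :: cells_global = 103, nprocs = 4
        integer :: rank, rem, base, cells_local, first

        base = cells_global/nprocs      ! minimum cells per rank (25)
        rem = mod(cells_global, nprocs) ! leftover cells handed to the first ranks (3)

        do rank = 0, nprocs - 1
            if (rank < rem) then
                cells_local = base + 1
                first = cells_local*rank
            else
                cells_local = base
                first = cells_local*rank + rem
            end if
            print '(3(A,I0))', 'rank ', rank, ': start_idx ', first, &
                ', local cells ', cells_local
        end do
    end program start_idx_sketch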
+
+ do iter_dir = 1, num_dims
+ proc_coords(iter_dir) = proc_coords(iter_dir) - 1
+ call MPI_CART_RANK(MPI_COMM_CART, proc_coords, neighbor_procs(iter_dir, -1), ierr)
+ proc_coords(iter_dir) = proc_coords(iter_dir) + 1
+ proc_coords(iter_dir) = proc_coords(iter_dir) + 1
+ call MPI_CART_RANK(MPI_COMM_CART, proc_coords, neighbor_procs(iter_dir, +1), ierr)
+ proc_coords(iter_dir) = proc_coords(iter_dir) - 1
+ end do
+
+ if (proc_nums(1) > 1) then
+ if (bc_x%beg == -1 .or. proc_coords(1) > 0) then
+ bc_x%beg = neighbor_procs(1, -1)
+ end if
+
+ if (bc_x%end == +1 .or. proc_coords(1) < proc_nums(1) - 1) then
+ bc_x%end = neighbor_procs(1, +1)
+ end if
+ end if
+
+ if (num_dims >= 2 .and. proc_nums(2) > 1) then
+ if (bc_y%beg == -1 .or. proc_coords(2) > 0) then
+ bc_y%beg = neighbor_procs(2, -1)
+ end if
+
+ if (bc_y%end == +1 .or. proc_coords(2) < proc_nums(2) - 1) then
+ bc_y%end = neighbor_procs(2, +1)
+ end if
+ end if
+
+ if (num_dims >= 3 .and. proc_nums(3) > 1) then
+ if (bc_z%beg == -1 .or. proc_coords(3) > 0) then
+ bc_z%beg = neighbor_procs(3, -1)
+ end if
+
+ if (bc_z%end == +1 .or. proc_coords(3) < proc_nums(3) - 1) then
+ bc_z%end = neighbor_procs(3, +1)
+ end if
+ end if
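Because the Cartesian communicator is created with all periods set to .true., the coordinate shifts in the neighbor loop above always resolve through MPI_CART_RANK and wrap around at the domain ends; the resulting neighbor ranks then overwrite the bc codes, so faces flagged as periodic and all interior faces end up holding the neighboring rank rather than a physical boundary-condition code. A minimal sketch of that wraparound behavior, assuming an MPI library that provides the mpi module, is:

    program cart_neighbor_sketch
        ! Periodic neighbor lookup on a 1D Cartesian topology: with
        ! periods = .true., MPI_CART_RANK wraps out-of-range coordinates,
        ! so rank 0 sees rank nprocs-1 as its left neighbor.
        use mpi
        implicit none
        integer :: ierr, nprocs, rank, comm_cart, left, right
        integer :: dims(1), coords(1)
        logical :: periods(1)

        call MPI_INIT(ierr)
        call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)

        dims(1) = nprocs; periods(1) = .true.
        call MPI_CART_CREATE(MPI_COMM_WORLD, 1, dims, periods, .false., comm_cart, ierr)
        call MPI_COMM_RANK(comm_cart, rank, ierr)
        call MPI_CART_COORDS(comm_cart, rank, 1, coords, ierr)

        coords(1) = coords(1) - 1
        call MPI_CART_RANK(comm_cart, coords, left, ierr)  ! wraps to nprocs-1 on rank 0
        coords(1) = coords(1) + 2
        call MPI_CART_RANK(comm_cart, coords, right, ierr) ! wraps to 0 on rank nprocs-1
        coords(1) = coords(1) - 1

        print '(3(A,I0))', 'rank ', rank, ': left ', left, ', right ', right
        call MPI_FINALIZE(ierr)
    end program cart_neighbor_sketch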
+
+#endif
+
+ end subroutine s_mpi_decompose_computational_domain
+
+ subroutine s_prohibit_abort(condition, message)
+ character(len=*), intent(in) :: condition, message
+
+ print *, ""
+ print *, "===================================================================================================="
+ print *, " CASE FILE ERROR "
+ print *, "----------------------------------------------------------------------------------------------------"
+ print *, "Prohibited condition: ", trim(condition)
+ if (len_trim(message) > 0) then
+ print *, "Note: ", trim(message)
+ end if
+ print *, "===================================================================================================="
+ print *, ""
+ call s_mpi_abort
+ end subroutine s_prohibit_abort
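s_prohibit_abort gives the case-file checkers a single formatted error path: print the violated condition, print an optional note, and abort all ranks. A hypothetical call site is sketched below; the condition and note strings are illustrative and not taken from MFC's checkers.

    ! Hypothetical validation call; the strings are illustrative only.
    if (weno_order /= 1 .and. weno_order /= 3 .and. weno_order /= 5) then
        call s_prohibit_abort('weno_order must be 1, 3, or 5', &
                              'check weno_order in the case file')
    end if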
+
end module m_mpi_common
diff --git a/src/simulation/m_nvtx.f90 b/src/common/m_nvtx.f90
similarity index 100%
rename from src/simulation/m_nvtx.f90
rename to src/common/m_nvtx.f90
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index ffc6e02ba6..f6fd7e5542 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -23,7 +23,7 @@ module m_variables_conversion
use m_helper
use m_thermochem, only: &
- num_species, get_temperature, get_pressure, &
+ num_species, get_temperature, get_pressure, gas_constant, &
get_mixture_molecular_weight, get_mixture_energy_mass
! ==========================================================================
@@ -1114,7 +1114,7 @@ contains
end do
call get_mixture_molecular_weight(Ys, mix_mol_weight)
- T = q_prim_vf(T_idx)%sf(j, k, l)
+ T = q_prim_vf(E_idx)%sf(j, k, l)*mix_mol_weight/(gas_constant*rho)
call get_mixture_energy_mass(T, Ys, e_mix)
q_cons_vf(E_idx)%sf(j, k, l) = &
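The replacement line recovers the temperature from the primitive pressure stored at E_idx via the ideal-gas mixture relation T = p*mix_mol_weight/(gas_constant*rho) instead of reading a stored temperature field. As a rough sanity check with air-like numbers, assuming consistent SI units (an assumption about unit conventions, not a statement of MFC's): p = 101325 Pa, mix_mol_weight = 0.02897 kg/mol, gas_constant = 8.314 J/(mol K), and rho = 1.204 kg/m^3 give T ≈ 101325*0.02897/(8.314*1.204) ≈ 293 K, i.e. room-temperature air.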
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.fpp
similarity index 53%
rename from src/post_process/m_data_input.f90
rename to src/post_process/m_data_input.fpp
index 567816f73b..9421865870 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.fpp
@@ -2,6 +2,8 @@
!! @file m_data_input.f90
!> @brief Contains module m_data_input
+#:include 'macros.fpp'
+
!> @brief This module features procedures, which for a specific time-step,
!! read in the raw simulation data for the grid and the conservative
!! variables and fill out their buffer regions.
@@ -21,6 +23,8 @@ module m_data_input
use m_compile_specific
use m_helper
+
+ use m_boundary_conditions_common
! ==========================================================================
implicit none
@@ -29,8 +33,6 @@ module m_data_input
s_read_data_files, &
s_read_serial_data_files, &
s_read_parallel_data_files, &
- s_populate_grid_variables_buffer_regions, &
- s_populate_conservative_variables_buffer_regions, &
s_finalize_data_input_module
abstract interface ! ===================================================
@@ -89,6 +91,8 @@ subroutine s_read_serial_data_files(t_step)
logical :: file_check !<
!! Generic logical used to test the existence of a particular file
+ integer :: iter_dir, iter_loc
+
integer :: i !< Generic loop iterator
! Setting location of time-step folder based on current time-step
@@ -226,6 +230,10 @@ subroutine s_read_serial_data_files(t_step)
end do
+ if (t_step == 0) then
+ call s_read_boundary_condition_files(t_step_dir)
+ end if
+
if (ib) then
write (file_loc_ib, '(A,I0,A)') &
trim(t_step_ib_dir)//'/ib.dat'
@@ -235,7 +243,8 @@ subroutine s_read_serial_data_files(t_step)
FORM='unformatted', &
ACTION='read', &
STATUS='old')
- call s_mpi_abort(trim(file_loc)//' is missing. Exiting ...')
+ else
+ call s_mpi_abort(trim(file_loc_ib)//' is missing. Exiting ...')
end if
end if
@@ -340,6 +349,10 @@ subroutine s_read_parallel_data_files(t_step)
end if
end if
+ if (t_step == 0) then
+ call s_read_boundary_condition_files(trim(case_dir)//'/restart_data')
+ end if
+
if (file_per_process) then
call s_int_to_str(t_step, t_step_string)
! Open the file to read conservative variables
@@ -506,635 +519,6 @@ subroutine s_read_parallel_data_files(t_step)
end subroutine s_read_parallel_data_files
- !> The following subroutine populates the buffer regions of
- !! the cell-width spacings, the cell-boundary locations and
- !! the cell-center locations. Note that the buffer regions
- !! of the last two variables should be interpreted slightly
- !! differently than usual. They are really ghost zones that
- !! are used in aiding the multidimensional visualization of
- !! Silo database files, in VisIt, when processor boundary
- !! conditions are present.
- subroutine s_populate_grid_variables_buffer_regions
-
- integer :: i !< Generic loop iterator
-
- ! Populating Buffer Regions in the x-direction =====================
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_x%beg <= -3) then
-
- do i = 1, buff_size
- dx(-i) = dx(0)
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_x%beg == -2) then
-
- do i = 1, buff_size
- dx(-i) = dx(i - 1)
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_x%beg == -1) then
-
- do i = 1, buff_size
- dx(-i) = dx((m + 1) - i)
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('beg', 'x')
-
- end if
-
- do i = 1, offset_x%beg
- x_cb(-1 - i) = x_cb(-i) - dx(-i)
- end do
-
- do i = 1, buff_size
- x_cc(-i) = x_cc(1 - i) - (dx(1 - i) + dx(-i))/2d0
- end do
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_x%end <= -3) then
-
- do i = 1, buff_size
- dx(m + i) = dx(m)
- end do
-
- ! Symmetry BC at the end
- elseif (bc_x%end == -2) then
-
- do i = 1, buff_size
- dx(m + i) = dx((m + 1) - i)
- end do
-
- ! Periodic BC at the end
- elseif (bc_x%end == -1) then
-
- do i = 1, buff_size
- dx(m + i) = dx(i - 1)
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('end', 'x')
-
- end if
-
- do i = 1, offset_x%end
- x_cb(m + i) = x_cb(m + (i - 1)) + dx(m + i)
- end do
-
- do i = 1, buff_size
- x_cc(m + i) = x_cc(m + (i - 1)) + (dx(m + (i - 1)) + dx(m + i))/2d0
- end do
-
- ! END: Populating Buffer Regions in the x-direction ================
-
- ! Populating Buffer Regions in the y-direction =====================
-
- if (n > 0) then
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_y%beg <= -3 .and. bc_y%beg /= -14) then
-
- do i = 1, buff_size
- dy(-i) = dy(0)
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_y%beg == -2 .or. bc_y%beg == -14) then
-
- do i = 1, buff_size
- dy(-i) = dy(i - 1)
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_y%beg == -1) then
-
- do i = 1, buff_size
- dy(-i) = dy((n + 1) - i)
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('beg', 'y')
-
- end if
-
- do i = 1, offset_y%beg
- y_cb(-1 - i) = y_cb(-i) - dy(-i)
- end do
-
- do i = 1, buff_size
- y_cc(-i) = y_cc(1 - i) - (dy(1 - i) + dy(-i))/2d0
- end do
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_y%end <= -3) then
-
- do i = 1, buff_size
- dy(n + i) = dy(n)
- end do
-
- ! Symmetry BC at the end
- elseif (bc_y%end == -2) then
-
- do i = 1, buff_size
- dy(n + i) = dy((n + 1) - i)
- end do
-
- ! Periodic BC at the end
- elseif (bc_y%end == -1) then
-
- do i = 1, buff_size
- dy(n + i) = dy(i - 1)
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('end', 'y')
-
- end if
-
- do i = 1, offset_y%end
- y_cb(n + i) = y_cb(n + (i - 1)) + dy(n + i)
- end do
-
- do i = 1, buff_size
- y_cc(n + i) = y_cc(n + (i - 1)) + (dy(n + (i - 1)) + dy(n + i))/2d0
- end do
-
- ! END: Populating Buffer Regions in the y-direction ================
-
- ! Populating Buffer Regions in the z-direction =====================
-
- if (p > 0) then
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_z%beg <= -3) then
-
- do i = 1, buff_size
- dz(-i) = dz(0)
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_z%beg == -2) then
-
- do i = 1, buff_size
- dz(-i) = dz(i - 1)
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_z%beg == -1) then
-
- do i = 1, buff_size
- dz(-i) = dz((p + 1) - i)
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('beg', 'z')
-
- end if
-
- do i = 1, offset_z%beg
- z_cb(-1 - i) = z_cb(-i) - dz(-i)
- end do
-
- do i = 1, buff_size
- z_cc(-i) = z_cc(1 - i) - (dz(1 - i) + dz(-i))/2d0
- end do
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_z%end <= -3) then
-
- do i = 1, buff_size
- dz(p + i) = dz(p)
- end do
-
- ! Symmetry BC at the end
- elseif (bc_z%end == -2) then
-
- do i = 1, buff_size
- dz(p + i) = dz((p + 1) - i)
- end do
-
- ! Periodic BC at the end
- elseif (bc_z%end == -1) then
-
- do i = 1, buff_size
- dz(p + i) = dz(i - 1)
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_grid_vars_buffer_regions('end', 'z')
-
- end if
-
- do i = 1, offset_z%end
- z_cb(p + i) = z_cb(p + (i - 1)) + dz(p + i)
- end do
-
- do i = 1, buff_size
- z_cc(p + i) = z_cc(p + (i - 1)) + (dz(p + (i - 1)) + dz(p + i))/2d0
- end do
-
- end if
-
- end if
-
- ! END: Populating Buffer Regions in the z-direction ================
-
- end subroutine s_populate_grid_variables_buffer_regions
-
- !> The purpose of this procedure is to populate the buffers
- !! of the cell-average conservative variables, depending on
- !! the boundary conditions.
- subroutine s_populate_conservative_variables_buffer_regions
-
- integer :: i, j, k !< Generic loop iterators
-
- ! Populating Buffer Regions in the x-direction =====================
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_x%beg <= -3) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(-j, 0:n, 0:p) = q_cons_vf(i)%sf(0, 0:n, 0:p)
- end do
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_x%beg == -2) then
-
- do j = 1, buff_size
-
- ! Density or partial densities
- do i = 1, cont_idx%end
- q_cons_vf(i)%sf(-j, 0:n, 0:p) = &
- q_cons_vf(i)%sf(j - 1, 0:n, 0:p)
- end do
-
- ! x-component of momentum
- q_cons_vf(mom_idx%beg)%sf(-j, 0:n, 0:p) = &
- -q_cons_vf(mom_idx%beg)%sf(j - 1, 0:n, 0:p)
-
- ! Remaining momentum component(s), if any, as well as the
- ! energy and the variable(s) from advection equation(s)
- do i = mom_idx%beg + 1, sys_size
- q_cons_vf(i)%sf(-j, 0:n, 0:p) = &
- q_cons_vf(i)%sf(j - 1, 0:n, 0:p)
- end do
-
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_x%beg == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(-j, 0:n, 0:p) = &
- q_cons_vf(i)%sf((m + 1) - j, 0:n, 0:p)
- end do
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'beg', 'x')
-
- end if
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_x%end <= -3) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(m + j, 0:n, 0:p) = &
- q_cons_vf(i)%sf(m, 0:n, 0:p)
- end do
- end do
-
- ! Symmetry BC at the end
- elseif (bc_x%end == -2) then
-
- do j = 1, buff_size
-
- ! Density or partial densities
- do i = 1, cont_idx%end
- q_cons_vf(i)%sf(m + j, 0:n, 0:p) = &
- q_cons_vf(i)%sf((m + 1) - j, 0:n, 0:p)
- end do
-
- ! x-component of momentum
- q_cons_vf(mom_idx%beg)%sf(m + j, 0:n, 0:p) = &
- -q_cons_vf(mom_idx%beg)%sf((m + 1) - j, 0:n, 0:p)
-
- ! Remaining momentum component(s), if any, as well as the
- ! energy and the variable(s) from advection equation(s)
- do i = mom_idx%beg + 1, sys_size
- q_cons_vf(i)%sf(m + j, 0:n, 0:p) = &
- q_cons_vf(i)%sf((m + 1) - j, 0:n, 0:p)
- end do
-
- end do
-
- ! Perodic BC at the end
- elseif (bc_x%end == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(m + j, 0:n, 0:p) = &
- q_cons_vf(i)%sf(j - 1, 0:n, 0:p)
- end do
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'end', 'x')
-
- end if
-
- ! END: Populating Buffer Regions in the x-direction ================
-
- ! Populating Buffer Regions in the y-direction =====================
-
- if (n > 0) then
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_y%beg <= -3 .and. bc_y%beg /= -14) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, -j, 0:p) = q_cons_vf(i)%sf(:, 0, 0:p)
- end do
- end do
-
- ! Axis BC at the beginning
- elseif (bc_y%beg == -14) then
-
- do j = 1, buff_size
- do k = 0, p
- if (z_cc(k) < pi) then
- do i = 1, mom_idx%beg
- q_cons_vf(i)%sf(:, -j, k) = &
- q_cons_vf(i)%sf(:, j - 1, k + ((p + 1)/2))
- end do
-
- q_cons_vf(mom_idx%beg + 1)%sf(:, -j, k) = &
- -q_cons_vf(mom_idx%beg + 1)%sf(:, j - 1, k + ((p + 1)/2))
-
- q_cons_vf(mom_idx%end)%sf(:, -j, k) = &
- -q_cons_vf(mom_idx%end)%sf(:, j - 1, k + ((p + 1)/2))
-
- do i = E_idx, sys_size
- q_cons_vf(i)%sf(:, -j, k) = &
- q_cons_vf(i)%sf(:, j - 1, k + ((p + 1)/2))
- end do
- else
- do i = 1, mom_idx%beg
- q_cons_vf(i)%sf(:, -j, k) = &
- q_cons_vf(i)%sf(:, j - 1, k - ((p + 1)/2))
- end do
-
- q_cons_vf(mom_idx%beg + 1)%sf(:, -j, k) = &
- -q_cons_vf(mom_idx%beg + 1)%sf(:, j - 1, k - ((p + 1)/2))
-
- q_cons_vf(mom_idx%end)%sf(:, -j, k) = &
- -q_cons_vf(mom_idx%end)%sf(:, j - 1, k - ((p + 1)/2))
-
- do i = E_idx, sys_size
- q_cons_vf(i)%sf(:, -j, k) = &
- q_cons_vf(i)%sf(:, j - 1, k - ((p + 1)/2))
- end do
- end if
- end do
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_y%beg == -2) then
-
- do j = 1, buff_size
-
- ! Density or partial densities and x-momentum component
- do i = 1, mom_idx%beg
- q_cons_vf(i)%sf(:, -j, 0:p) = &
- q_cons_vf(i)%sf(:, j - 1, 0:p)
- end do
-
- ! y-component of momentum
- q_cons_vf(mom_idx%beg + 1)%sf(:, -j, 0:p) = &
- -q_cons_vf(mom_idx%beg + 1)%sf(:, j - 1, 0:p)
-
- ! Remaining z-momentum component, if any, as well as the
- ! energy and variable(s) from advection equation(s)
- do i = mom_idx%beg + 2, sys_size
- q_cons_vf(i)%sf(:, -j, 0:p) = &
- q_cons_vf(i)%sf(:, j - 1, 0:p)
- end do
-
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_y%beg == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, -j, 0:p) = &
- q_cons_vf(i)%sf(:, (n + 1) - j, 0:p)
- end do
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'beg', 'y')
-
- end if
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_y%end <= -3) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, n + j, 0:p) = &
- q_cons_vf(i)%sf(:, n, 0:p)
- end do
- end do
-
- ! Symmetry BC at the end
- elseif (bc_y%end == -2) then
-
- do j = 1, buff_size
-
- ! Density or partial densities and x-momentum component
- do i = 1, mom_idx%beg
- q_cons_vf(i)%sf(:, n + j, 0:p) = &
- q_cons_vf(i)%sf(:, (n + 1) - j, 0:p)
- end do
-
- ! y-component of momentum
- q_cons_vf(mom_idx%beg + 1)%sf(:, n + j, 0:p) = &
- -q_cons_vf(mom_idx%beg + 1)%sf(:, (n + 1) - j, 0:p)
-
- ! Remaining z-momentum component, if any, as well as the
- ! energy and variable(s) from advection equation(s)
- do i = mom_idx%beg + 2, sys_size
- q_cons_vf(i)%sf(:, n + j, 0:p) = &
- q_cons_vf(i)%sf(:, (n + 1) - j, 0:p)
- end do
-
- end do
-
- ! Perodic BC at the end
- elseif (bc_y%end == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, n + j, 0:p) = &
- q_cons_vf(i)%sf(:, j - 1, 0:p)
- end do
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'end', 'y')
-
- end if
-
- ! END: Populating Buffer Regions in the y-direction ================
-
- ! Populating Buffer Regions in the z-direction =====================
-
- if (p > 0) then
-
- ! Ghost-cell extrapolation BC at the beginning
- if (bc_z%beg <= -3) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, :, -j) = q_cons_vf(i)%sf(:, :, 0)
- end do
- end do
-
- ! Symmetry BC at the beginning
- elseif (bc_z%beg == -2) then
-
- do j = 1, buff_size
-
- ! Density or the partial densities and the momentum
- ! components in x- and y-directions
- do i = 1, mom_idx%beg + 1
- q_cons_vf(i)%sf(:, :, -j) = &
- q_cons_vf(i)%sf(:, :, j - 1)
- end do
-
- ! z-component of momentum
- q_cons_vf(mom_idx%end)%sf(:, :, -j) = &
- -q_cons_vf(mom_idx%end)%sf(:, :, j - 1)
-
- ! Energy and advection equation(s) variable(s)
- do i = E_idx, sys_size
- q_cons_vf(i)%sf(:, :, -j) = &
- q_cons_vf(i)%sf(:, :, j - 1)
- end do
-
- end do
-
- ! Periodic BC at the beginning
- elseif (bc_z%beg == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, :, -j) = &
- q_cons_vf(i)%sf(:, :, (p + 1) - j)
- end do
- end do
-
- ! Processor BC at the beginning
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'beg', 'z')
-
- end if
-
- ! Ghost-cell extrapolation BC at the end
- if (bc_z%end <= -3) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, :, p + j) = &
- q_cons_vf(i)%sf(:, :, p)
- end do
- end do
-
- ! Symmetry BC at the end
- elseif (bc_z%end == -2) then
-
- do j = 1, buff_size
-
- ! Density or the partial densities and the momentum
- ! components in x- and y-directions
- do i = 1, mom_idx%beg + 1
- q_cons_vf(i)%sf(:, :, p + j) = &
- q_cons_vf(i)%sf(:, :, (p + 1) - j)
- end do
-
- ! z-component of momentum
- q_cons_vf(mom_idx%end)%sf(:, :, p + j) = &
- -q_cons_vf(mom_idx%end)%sf(:, :, (p + 1) - j)
-
- ! Energy and advection equation(s) variable(s)
- do i = E_idx, sys_size
- q_cons_vf(i)%sf(:, :, p + j) = &
- q_cons_vf(i)%sf(:, :, (p + 1) - j)
- end do
-
- end do
-
- ! Perodic BC at the end
- elseif (bc_z%end == -1) then
-
- do j = 1, buff_size
- do i = 1, sys_size
- q_cons_vf(i)%sf(:, :, p + j) = &
- q_cons_vf(i)%sf(:, :, j - 1)
- end do
- end do
-
- ! Processor BC at the end
- else
-
- call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
- 'end', 'z')
-
- end if
-
- end if
-
- end if
-
- ! END: Populating Buffer Regions in the z-direction ================
-
- end subroutine s_populate_conservative_variables_buffer_regions
-
!> Computation of parameters, allocation procedures, and/or
!! any other tasks needed to properly setup the module
subroutine s_initialize_data_input_module
diff --git a/src/post_process/m_derived_variables.fpp b/src/post_process/m_derived_variables.fpp
index e08973bd21..8f9bb01922 100644
--- a/src/post_process/m_derived_variables.fpp
+++ b/src/post_process/m_derived_variables.fpp
@@ -410,6 +410,7 @@ contains
- fd_coeff_z(r, l)* &
q_prim_vf(mom_idx%beg + 1)%sf(j, k, r + l))
else
+
q_sf(j, k, l) = &
q_sf(j, k, l) + fd_coeff_y(r, k)* &
q_prim_vf(mom_idx%end)%sf(j, r + k, l) &
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index b150f23e7d..7730c7c073 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -20,6 +20,8 @@ module m_global_parameters
use m_thermochem, only: num_species, species_names
+ use m_global_parameters_common
+
! ==========================================================================
implicit none
@@ -109,6 +111,7 @@ module m_global_parameters
logical :: mixture_err !< Mixture error limiter
logical :: alt_soundspeed !< Alternate sound speed
logical :: hypoelasticity !< Turn hypoelasticity on
+ logical :: rdma_mpi !< Turn on RDMA for MPI
logical, parameter :: chemistry = .${chemistry}$. !< Chemistry modeling
!> @}
@@ -144,14 +147,20 @@ module m_global_parameters
!> @name Boundary conditions in the x-, y- and z-coordinate directions
!> @{
type(int_bounds_info) :: bc_x, bc_y, bc_z
+ integer :: num_bc_patches
+ type(bc_patch_parameters) :: patch_bc(num_bc_patches_max)
+ type(bounds_info) :: x_domain, y_domain, z_domain
!> @}
logical :: parallel_io !< Format of the data files
logical :: file_per_process !< output format
- integer, allocatable, dimension(:) :: proc_coords !<
+ integer, dimension(1:3) :: proc_coords !<
!! Processor coordinates in MPI_CART_COMM
+ integer, dimension(1:3) :: proc_nums !<
+ !! Processor dimensions in MPI_CART_COMM
+
integer, allocatable, dimension(:) :: start_idx !<
!! Starting cell-center index of local processor in global grid
@@ -331,18 +340,19 @@ contains
relax = .false.
relax_model = dflt_int
hypoelasticity = .false.
+ rdma_mpi = .false.
bc_x%beg = dflt_int; bc_x%end = dflt_int
bc_y%beg = dflt_int; bc_y%end = dflt_int
bc_z%beg = dflt_int; bc_z%end = dflt_int
#:for DIM in ['x', 'y', 'z']
- #:for DIR in [1, 2, 3]
- bc_${DIM}$%vb${DIR}$ = 0d0
- bc_${DIM}$%ve${DIR}$ = 0d0
- #:endfor
+ bc_${DIM}$%vel_beg = 0d0
+ bc_${DIM}$%vel_end = 0d0
#:endfor
+ call s_bc_assign_default_values_to_user_inputs(num_bc_patches, patch_bc)
+
! Fluids physical parameters
do i = 1, num_fluids_max
fluid_pp(i)%gamma = dflt_real
@@ -772,8 +782,6 @@ contains
num_dims = 1 + min(1, n) + min(1, p)
- allocate (proc_coords(1:num_dims))
-
if (parallel_io .neqv. .true.) return
#ifdef MFC_MPI
@@ -818,8 +826,6 @@ contains
end if
- deallocate (proc_coords)
-
deallocate (adv)
#ifdef MFC_MPI
diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp
index 20c5346d21..2bfe9c91d1 100644
--- a/src/post_process/m_mpi_proxy.fpp
+++ b/src/post_process/m_mpi_proxy.fpp
@@ -199,1214 +199,6 @@ contains
end subroutine s_mpi_bcast_user_inputs
- !> This subroutine takes care of efficiently distributing
- !! the computational domain among the available processors
- !! as well as recomputing some of the global parameters so
- !! that they reflect the configuration of sub-domain that
- !! is overseen by the local processor.
- subroutine s_mpi_decompose_computational_domain
-
-#ifdef MFC_MPI
-
- ! # of processors in the x-, y- and z-coordinate directions
- integer :: num_procs_x, num_procs_y, num_procs_z
-
- ! Temporary # of processors in x-, y- and z-coordinate directions
- ! used during the processor factorization optimization procedure
- real(kind(0d0)) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
-
- ! Processor factorization (fct) minimization parameter
- real(kind(0d0)) :: fct_min
-
- ! Cartesian processor topology communicator
- integer :: MPI_COMM_CART
-
- ! Number of remaining cells for a particular coordinate direction
- ! after the bulk has evenly been distributed among the available
- ! processors for that coordinate direction
- integer :: rem_cells
-
- ! Generic loop iterators
- integer :: i, j
-
- if (num_procs == 1 .and. parallel_io) then
- do i = 1, num_dims
- start_idx(i) = 0
- end do
- return
- end if
-
- ! Performing the computational domain decomposition. The procedure
- ! is optimized by ensuring that each processor contains a close to
- ! equivalent piece of the computational domain. Note that explicit
- ! type-casting is omitted here for code legibility purposes.
-
- ! Generating 3D Cartesian Processor Topology =======================
-
- if (n > 0) then
-
- if (p > 0) then
-
- if (cyl_coord .and. p > 0) then
- ! Implement pencil processor blocking if using cylindrical coordinates so
- ! that all cells in azimuthal direction are stored on a single processor.
- ! This is necessary for efficient application of Fourier filter near axis.
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = num_procs
- num_procs_z = 1
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- else
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = 1
- num_procs_z = num_procs
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + 10d0*abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- do j = 1, (num_procs/i)
-
- if (mod(num_procs/i, j) == 0 &
- .and. &
- (n + 1)/j >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = j
- tmp_num_procs_z = num_procs/(i*j)
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z) &
- .and. &
- (p + 1)/tmp_num_procs_z &
- >= &
- num_stcls_min*weno_order) &
- then
-
- num_procs_x = i
- num_procs_y = j
- num_procs_z = num_procs/(i*j)
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- end if
-
- end do
-
- end if
-
- ! Checking whether the decomposition of the computational
- ! domain was successful
- if (proc_rank == 0 .and. ierr == -1) then
- print '(A)', 'Unable to decompose computational '// &
- 'domain for selected number of '// &
- 'processors. Exiting ...'
- call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
- end if
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, &
- num_procs_y, num_procs_z/), &
- (/.true., .true., .true./), &
- .false., MPI_COMM_CART, ierr)
-
- ! Finding corresponding Cartesian coordinates of the local
- ! processor rank in newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, &
- proc_coords, ierr)
-
- ! END: Generating 3D Cartesian Processor Topology ==================
-
- ! Sub-domain Global Parameters in z-direction ======================
-
- ! Number of remaining cells after majority is distributed
- rem_cells = mod(p + 1, num_procs_z)
-
- ! Optimal number of cells per processor
- p = (p + 1)/num_procs_z - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(3) == i - 1) then
- p = p + 1
- exit
- end if
- end do
-
- ! Boundary condition at the beginning
- if (proc_coords(3) > 0 .or. bc_z%beg == -1) then
- proc_coords(3) = proc_coords(3) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_z%beg, ierr)
- proc_coords(3) = proc_coords(3) + 1
- end if
-
- ! Ghost zone at the beginning
- if (proc_coords(3) > 0 .and. format == 1) then
- offset_z%beg = 2
- else
- offset_z%beg = 0
- end if
-
- ! Boundary condition at the end
- if (proc_coords(3) < num_procs_z - 1 .or. bc_z%end == -1) then
- proc_coords(3) = proc_coords(3) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_z%end, ierr)
- proc_coords(3) = proc_coords(3) - 1
- end if
-
- ! Ghost zone at the end
- if (proc_coords(3) < num_procs_z - 1 .and. format == 1) then
- offset_z%end = 2
- else
- offset_z%end = 0
- end if
-
- if (parallel_io) then
- if (proc_coords(3) < rem_cells) then
- start_idx(3) = (p + 1)*proc_coords(3)
- else
- start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
- end if
- end if
- ! ==================================================================
-
- ! Generating 2D Cartesian Processor Topology =======================
-
- else
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = num_procs
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- ! Checking whether the decomposition of the computational
- ! domain was successful
- if (proc_rank == 0 .and. ierr == -1) then
- print '(A)', 'Unable to decompose computational '// &
- 'domain for selected number of '// &
- 'processors. Exiting ...'
- call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
- end if
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, &
- num_procs_y/), (/.true., &
- .true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding corresponding Cartesian coordinates of the local
- ! processor rank in newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, &
- proc_coords, ierr)
-
- end if
-
- ! END: Generating 2D Cartesian Processor Topology ==================
-
- ! Sub-domain Global Parameters in y-direction ======================
-
- ! Number of remaining cells after majority has been distributed
- rem_cells = mod(n + 1, num_procs_y)
-
- ! Optimal number of cells per processor
- n = (n + 1)/num_procs_y - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(2) == i - 1) then
- n = n + 1
- exit
- end if
- end do
-
- ! Boundary condition at the beginning
- if (proc_coords(2) > 0 .or. bc_y%beg == -1) then
- proc_coords(2) = proc_coords(2) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_y%beg, ierr)
- proc_coords(2) = proc_coords(2) + 1
- end if
-
- ! Ghost zone at the beginning
- if (proc_coords(2) > 0 .and. format == 1) then
- offset_y%beg = 2
- else
- offset_y%beg = 0
- end if
-
- ! Boundary condition at the end
- if (proc_coords(2) < num_procs_y - 1 .or. bc_y%end == -1) then
- proc_coords(2) = proc_coords(2) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_y%end, ierr)
- proc_coords(2) = proc_coords(2) - 1
- end if
-
- ! Ghost zone at the end
- if (proc_coords(2) < num_procs_y - 1 .and. format == 1) then
- offset_y%end = 2
- else
- offset_y%end = 0
- end if
-
- if (parallel_io) then
- if (proc_coords(2) < rem_cells) then
- start_idx(2) = (n + 1)*proc_coords(2)
- else
- start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
- end if
- end if
- ! ==================================================================
-
- ! Generating 1D Cartesian Processor Topology =======================
-
- else
-
- ! Number of processors in the coordinate direction is equal to
- ! the total number of processors available
- num_procs_x = num_procs
-
- ! Number of cells in undecomposed computational domain needed
- ! for sub-domain reassembly during formatted data output
- m_root = m
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), &
- (/.true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding the corresponding Cartesian coordinates of the local
- ! processor rank in the newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, &
- proc_coords, ierr)
-
- end if
-
- ! ==================================================================
-
- ! Sub-domain Global Parameters in x-direction ======================
-
- ! Number of remaining cells after majority has been distributed
- rem_cells = mod(m + 1, num_procs_x)
-
- ! Optimal number of cells per processor
- m = (m + 1)/num_procs_x - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(1) == i - 1) then
- m = m + 1
- exit
- end if
- end do
-
- ! Boundary condition at the beginning
- if (proc_coords(1) > 0 .or. bc_x%beg == -1) then
- proc_coords(1) = proc_coords(1) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
- proc_coords(1) = proc_coords(1) + 1
- end if
-
- ! Ghost zone at the beginning
- if (proc_coords(1) > 0 .and. format == 1 .and. n > 0) then
- offset_x%beg = 2
- else
- offset_x%beg = 0
- end if
-
- ! Boundary condition at the end
- if (proc_coords(1) < num_procs_x - 1 .or. bc_x%end == -1) then
- proc_coords(1) = proc_coords(1) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
- proc_coords(1) = proc_coords(1) - 1
- end if
-
- ! Ghost zone at the end
- if (proc_coords(1) < num_procs_x - 1 .and. format == 1 .and. n > 0) then
- offset_x%end = 2
- else
- offset_x%end = 0
- end if
-
- if (parallel_io) then
- if (proc_coords(1) < rem_cells) then
- start_idx(1) = (m + 1)*proc_coords(1)
- else
- start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
- end if
- end if
- ! ==================================================================
-
-#endif
-
- end subroutine s_mpi_decompose_computational_domain
-
- !> Communicates the buffer regions associated with the grid
- !! variables with processors in charge of the neighboring
- !! sub-domains. Note that only cell-width spacings feature
- !! buffer regions so that no information relating to the
- !! cell-boundary locations is communicated.
- !! @param pbc_loc Processor boundary condition (PBC) location
- !! @param sweep_coord Coordinate direction normal to the processor boundary
- subroutine s_mpi_sendrecv_grid_vars_buffer_regions(pbc_loc, sweep_coord)
-
- character(LEN=3), intent(in) :: pbc_loc
- character, intent(in) :: sweep_coord
-
-#ifdef MFC_MPI
-
- ! Communications in the x-direction ================================
-
- if (sweep_coord == 'x') then
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_x%end >= 0) then
-
- ! Sending/receiving the data to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- dx(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_x%beg/bc_x%beg
- call MPI_SENDRECV(dx(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- dx(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_x%beg >= 0) then
-
- ! Sending/receiving the data to/from bc_x%beg/bc_x%end
- call MPI_SENDRECV(dx(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- dx(m + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_x%end/bc_x%end
- call MPI_SENDRECV(dx(m - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- dx(m + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- end if
-
- ! END: Communications in the x-direction ===========================
-
- ! Communications in the y-direction ================================
-
- elseif (sweep_coord == 'y') then
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_y%end >= 0) then
-
- ! Sending/receiving the data to/from bc_y%end/bc_y%beg
- call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 0, &
- dy(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_y%beg/bc_y%beg
- call MPI_SENDRECV(dy(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
- dy(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_y%beg >= 0) then
-
- ! Sending/receiving the data to/from bc_y%beg/bc_y%end
- call MPI_SENDRECV(dy(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
- dy(n + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_y%end/bc_y%end
- call MPI_SENDRECV(dy(n - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 0, &
- dy(n + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- end if
-
- ! END: Communications in the y-direction ===========================
-
- ! Communications in the z-direction ================================
-
- else
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_z%end >= 0) then
-
- ! Sending/receiving the data to/from bc_z%end/bc_z%beg
- call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- dz(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_z%beg/bc_z%beg
- call MPI_SENDRECV(dz(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- dz(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_z%beg >= 0) then
-
- ! Sending/receiving the data to/from bc_z%beg/bc_z%end
- call MPI_SENDRECV(dz(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- dz(p + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Sending/receiving the data to/from bc_z%end/bc_z%end
- call MPI_SENDRECV(dz(p - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- dz(p + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- end if
-
- end if
-
- ! END: Communications in the z-direction ===========================
-
-#endif
-
- end subroutine s_mpi_sendrecv_grid_vars_buffer_regions
-
- !> Communicates buffer regions associated with conservative
- !! variables with processors in charge of the neighboring
- !! sub-domains
- !! @param q_cons_vf Conservative variables
- !! @param pbc_loc Processor boundary condition (PBC) location
- !! @param sweep_coord Coordinate direction normal to the processor boundary
- subroutine s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, pbc_loc, &
- sweep_coord)
-
- type(scalar_field), &
- dimension(sys_size), &
- intent(inout) :: q_cons_vf
-
- character(LEN=3), intent(in) :: pbc_loc
-
- character, intent(in) :: sweep_coord
-
-#ifdef MFC_MPI
-
- integer :: i, j, k, l, r !< Generic loop iterators
-
- ! Communications in the x-direction ================================
-
- if (sweep_coord == 'x') then
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_x%end >= 0) then
-
- ! Packing the data to be sent to bc_x%end
- do l = 0, p
- do k = 0, n
- do j = m - buff_size + 1, m
- do i = 1, sys_size
- r = sys_size*(j - m + buff_size - 1) &
- + sys_size*buff_size*k + (i - 1) &
- + sys_size*buff_size*(n + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- q_cons_buffer_in(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Packing the data to be sent to bc_x%beg
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = 1, sys_size
- r = (i - 1) + sys_size*j &
- + sys_size*buff_size*k &
- + sys_size*buff_size*(n + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_x%beg/bc_x%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- q_cons_buffer_in(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data received from bc_x%beg
- do l = 0, p
- do k = 0, n
- do j = -buff_size, -1
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*buff_size*k + (i - 1) &
- + sys_size*buff_size*(n + 1)*l
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_x%beg >= 0) then
-
- ! Packing the data to be sent to bc_x%beg
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = 1, sys_size
- r = (i - 1) + sys_size*j &
- + sys_size*buff_size*k &
- + sys_size*buff_size*(n + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_x%beg/bc_x%end
- call MPI_SENDRECV(q_cons_buffer_out(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- q_cons_buffer_in(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Packing the data to be sent to bc_x%end
- do l = 0, p
- do k = 0, n
- do j = m - buff_size + 1, m
- do i = 1, sys_size
- r = sys_size*(j - m + buff_size - 1) &
- + sys_size*buff_size*k + (i - 1) &
- + sys_size*buff_size*(n + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_x%end/bc_x%end
- call MPI_SENDRECV(q_cons_buffer_out(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- q_cons_buffer_in(0), &
- buff_size*sys_size*(n + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data received from bc_x%end
- do l = 0, p
- do k = 0, n
- do j = m + 1, m + buff_size
- do i = 1, sys_size
- r = (i - 1) + sys_size*(j - m - 1) &
- + sys_size*buff_size*k &
- + sys_size*buff_size*(n + 1)*l
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- end if
-
- ! END: Communications in the x-direction ===========================
-
- ! Communications in the y-direction ================================
-
- elseif (sweep_coord == 'y') then
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_y%end >= 0) then
-
- ! Packing the data to be sent to bc_y%end
- do l = 0, p
- do k = n - buff_size + 1, n
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k - n + buff_size - 1) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- buff_size*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_y%end/bc_y%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (p + 1), MPI_DOUBLE_PRECISION, &
- bc_y%end, 0, q_cons_buffer_in(0), &
- buff_size*sys_size* &
- (m + 2*buff_size + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Packing the data to be sent to bc_y%beg
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)*k &
- + sys_size*(m + 2*buff_size + 1)* &
- buff_size*l + (i - 1)
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_y%beg/bc_y%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (p + 1), MPI_DOUBLE_PRECISION, &
- bc_y%beg, 1, q_cons_buffer_in(0), &
- buff_size*sys_size* &
- (m + 2*buff_size + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data received from bc_y%beg
- do l = 0, p
- do k = -buff_size, -1
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = (i - 1) + sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + sys_size* &
- (m + 2*buff_size + 1)*buff_size*l
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_y%beg >= 0) then
-
- ! Packing the data to be sent to bc_y%beg
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)*k &
- + sys_size*(m + 2*buff_size + 1)* &
- buff_size*l + (i - 1)
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_y%beg/bc_y%end
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (p + 1), MPI_DOUBLE_PRECISION, &
- bc_y%beg, 1, q_cons_buffer_in(0), &
- buff_size*sys_size* &
- (m + 2*buff_size + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Packing the data to be sent to bc_y%end
- do l = 0, p
- do k = n - buff_size + 1, n
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k - n + buff_size - 1) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- buff_size*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_y%end/bc_y%end
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (p + 1), MPI_DOUBLE_PRECISION, &
- bc_y%end, 0, q_cons_buffer_in(0), &
- buff_size*sys_size* &
- (m + 2*buff_size + 1)*(p + 1), &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data received form bc_y%end
- do l = 0, p
- do k = n + 1, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = (i - 1) + sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k - n - 1) + sys_size* &
- (m + 2*buff_size + 1)*buff_size*l
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- end if
-
- ! END: Communications in the y-direction ===========================
-
- ! Communications in the z-direction ================================
-
- else
-
- if (pbc_loc == 'beg') then ! Buffer region at the beginning
-
- ! PBC at both ends of the sub-domain
- if (bc_z%end >= 0) then
-
- ! Packing the data to be sent to bc_z%end
- do l = p - buff_size + 1, p
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + sys_size* &
- (m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)* &
- (l - p + buff_size - 1) + (i - 1)
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_z%end/bc_z%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- q_cons_buffer_in(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at beginning of the sub-domain
- else
-
- ! Packing the data to be sent to bc_z%beg
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_z%beg/bc_z%beg
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- q_cons_buffer_in(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data from bc_z%beg
- do l = -buff_size, -1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)*(l + buff_size)
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- else ! Buffer region at the end
-
- ! PBC at both ends of the sub-domain
- if (bc_z%beg >= 0) then
-
- ! Packing the data to be sent to bc_z%beg
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)*l
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_z%beg/bc_z%end
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- q_cons_buffer_in(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- ! PBC only at end of the sub-domain
- else
-
- ! Packing the data to be sent to bc_z%end
- do l = p - buff_size + 1, p
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + sys_size* &
- (m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)* &
- (l - p + buff_size - 1) + (i - 1)
- q_cons_buffer_out(r) = &
- q_cons_vf(i)%sf(j, k, l)
- end do
- end do
- end do
- end do
-
- ! Sending/receiving the data to/from bc_z%end/bc_z%end
- call MPI_SENDRECV(q_cons_buffer_out(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- q_cons_buffer_in(0), buff_size* &
- sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1), &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, &
- ierr)
-
- end if
-
- ! Unpacking the data received from bc_z%end
- do l = p + 1, p + buff_size
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do i = 1, sys_size
- r = sys_size*(j + buff_size) &
- + sys_size*(m + 2*buff_size + 1)* &
- (k + buff_size) + (i - 1) &
- + sys_size*(m + 2*buff_size + 1)* &
- (n + 2*buff_size + 1)*(l - p - 1)
- q_cons_vf(i)%sf(j, k, l) = q_cons_buffer_in(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- end if
-
- end if
-
- ! END: Communications in the z-direction ===========================
-
-#endif
-
- end subroutine s_mpi_sendrecv_cons_vars_buffer_regions
-
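
For reference, the packing and unpacking loops removed above flatten the four indices (i, j, k, l) into the single buffer offset r, and the MPI_SENDRECV counts are sized so that r spans exactly buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1) entries for a y-direction exchange. The following self-contained sketch reproduces that arithmetic with made-up sizes; it is illustrative only and not part of the patch.

! Illustrative sketch, not part of the patch: the flattened offset r used when
! unpacking q_cons_buffer_in for the bc_y%end buffer region above.
program buffer_index_sketch
    implicit none
    integer, parameter :: sys_size = 5, buff_size = 2, m = 10, n = 8, p = 6
    integer :: i, j, k, l, r, r_max

    ! One representative ghost cell: equation i at (j, k, l), with k in the
    ! y-end buffer layers n+1 .. n+buff_size.
    i = 3; j = -buff_size; k = n + 1; l = 0

    r = (i - 1) + sys_size*(j + buff_size) &
        + sys_size*(m + 2*buff_size + 1)*(k - n - 1) &
        + sys_size*(m + 2*buff_size + 1)*buff_size*l

    ! Largest possible offset, one less than the MPI_SENDRECV count.
    r_max = buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1) - 1

    print *, 'offset r =', r, 'of 0 ..', r_max
end program buffer_index_sketch
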
!> This subroutine gathers the Silo database metadata for
!! the spatial extents in order to boost the performance of
!! the multidimensional visualization.
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index 301e385d33..97e7062dc7 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -39,6 +39,8 @@ module m_start_up
use m_finite_differences
+ use m_boundary_conditions_common
+
! ==========================================================================
implicit none
@@ -66,9 +68,12 @@ subroutine s_read_input_file
namelist /user_inputs/ case_dir, m, n, p, t_step_start, &
t_step_stop, t_step_save, model_eqns, &
num_fluids, mpp_lim, &
- weno_order, bc_x, &
- bc_y, bc_z, fluid_pp, format, precision, &
- hypoelasticity, G, &
+ weno_order, &
+ bc_x, bc_y, bc_z, &
+ num_bc_patches, patch_bc, &
+ fluid_pp, format, precision, &
+ x_domain, y_domain, z_domain, &
+ hypoelasticity, rdma_mpi, G, &
chem_wrt_Y, chem_wrt_T, avg_state, &
alpha_rho_wrt, rho_wrt, mom_wrt, vel_wrt, &
E_wrt, pres_wrt, alpha_wrt, gamma_wrt, &
@@ -170,12 +175,12 @@ subroutine s_perform_time_step(t_step)
call s_read_data_files(t_step)
! Populating the buffer regions of the grid variables
if (buff_size > 0) then
- call s_populate_grid_variables_buffer_regions()
+ call s_mpi_sendrecv_grid_spacing_buffers(bc_id_sfs)
end if
! Populating the buffer regions of the conservative variables
if (buff_size > 0) then
- call s_populate_conservative_variables_buffer_regions()
+ call s_populate_cons_buffers(q_cons_vf)
end if
! Converting the conservative variables to the primitive ones
@@ -679,13 +684,17 @@ subroutine s_initialize_modules
! Computation of parameters, allocation procedures, and/or any other tasks
! needed to properly setup the modules
call s_initialize_global_parameters_module()
+ call s_initialize_boundary_conditions_module()
if (bubbles .and. nb > 1) then
call s_simpson
end if
if (bubbles .and. .not. polytropic) then
call s_initialize_nonpoly()
end if
- if (num_procs > 1) call s_initialize_mpi_proxy_module()
+ if (num_procs > 1) then
+ call s_initialize_mpi_common_module()
+ call s_initialize_mpi_proxy_module()
+ end if
call s_initialize_variables_conversion_module()
call s_initialize_data_input_module()
call s_initialize_derived_variables_module()
@@ -733,7 +742,10 @@ subroutine s_finalize_modules
call s_finalize_derived_variables_module()
call s_finalize_data_input_module()
call s_finalize_variables_conversion_module()
- if (num_procs > 1) call s_finalize_mpi_proxy_module()
+ if (num_procs > 1) then
+ call s_finalize_mpi_proxy_module()
+ call s_finalize_mpi_common_module()
+ end if
call s_finalize_global_parameters_module()
! Finalizing the MPI environment
diff --git a/src/pre_process/m_compute_levelset.fpp b/src/pre_process/m_compute_levelset.fpp
index 18faf7a469..499a88cc97 100644
--- a/src/pre_process/m_compute_levelset.fpp
+++ b/src/pre_process/m_compute_levelset.fpp
@@ -315,23 +315,25 @@ contains
if (side_dists(2) == 0) then
levelset_norm%sf(i, j, 0, ib_patch_id, 1) = 0d0
else
- levelset_norm%sf(i, j, 0, ib_patch_id, 1) = side_dists(2)/ &
+ levelset_norm%sf(i, j, 0, ib_patch_id, 1) = -side_dists(2)/ &
abs(side_dists(2))
end if
else if (min_dist == abs(side_dists(3))) then
+ levelset%sf(i, j, 0, ib_patch_id) = side_dists(3)
if (side_dists(3) == 0) then
- levelset_norm%sf(i, j, 0, ib_patch_id, 1) = 0d0
+ levelset_norm%sf(i, j, 0, ib_patch_id, 2) = 0d0
else
- levelset_norm%sf(i, j, 0, ib_patch_id, 1) = side_dists(3)/ &
+ levelset_norm%sf(i, j, 0, ib_patch_id, 2) = side_dists(3)/ &
abs(side_dists(3))
end if
else if (min_dist == abs(side_dists(4))) then
+ levelset%sf(i, j, 0, ib_patch_id) = side_dists(4)
if (side_dists(4) == 0) then
- levelset_norm%sf(i, j, 0, ib_patch_id, 1) = 0d0
+ levelset_norm%sf(i, j, 0, ib_patch_id, 2) = 0d0
else
- levelset_norm%sf(i, j, 0, ib_patch_id, 1) = side_dists(4)/ &
+ levelset_norm%sf(i, j, 0, ib_patch_id, 2) = -side_dists(4)/ &
abs(side_dists(4))
end if
diff --git a/src/pre_process/m_data_output.fpp b/src/pre_process/m_data_output.fpp
index 8c6147adef..541b49eba5 100644
--- a/src/pre_process/m_data_output.fpp
+++ b/src/pre_process/m_data_output.fpp
@@ -32,6 +32,8 @@ module m_data_output
use m_thermochem, only: species_names
+ use m_boundary_conditions_common
+
! ==========================================================================
implicit none
@@ -124,6 +126,8 @@ contains
integer :: i, j, k, l, r, c, dir !< Generic loop iterator
integer :: t_step
+ integer :: iter_dir, iter_loc
+
real(kind(0d0)), dimension(nb) :: nRtmp !< Temporary bubble concentration
real(kind(0d0)) :: nbub !< Temporary bubble number density
real(kind(0d0)) :: gamma, lit_gamma, pi_inf, qv !< Temporary EOS params
@@ -145,6 +149,9 @@ contains
status = 'new'
end if
+ ! Boundary Conditions
+ call s_write_boundary_condition_files(t_step_dir)
+
! x-coordinate direction
file_loc = trim(t_step_dir)//'/x_cb.dat'
open (1, FILE=trim(file_loc), FORM='unformatted', STATUS=status)
@@ -561,6 +568,8 @@ contains
! Generic loop iterator
integer :: i
+ call s_write_boundary_condition_files(trim(case_dir)//'/restart_data')
+
if (file_per_process) then
if (proc_rank == 0) then
file_loc = trim(case_dir)//'/restart_data/lustre_0'
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index ef240f21d2..a4ae6b801b 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -20,6 +20,8 @@ module m_global_parameters
use m_thermochem, only: num_species
+ use m_global_parameters_common
+
! ==========================================================================
implicit none
@@ -92,6 +94,8 @@ module m_global_parameters
logical :: hypoelasticity !< activate hypoelasticity
logical, parameter :: chemistry = .${chemistry}$. !< Chemistry modeling
+ logical :: rdma_mpi !< Use RDMA for MPI communication
+
! Annotations of the structure, i.e. the organization, of the state vectors
type(int_bounds_info) :: cont_idx !< Indexes of first & last continuity eqns.
type(int_bounds_info) :: mom_idx !< Indexes of first & last momentum eqns.
@@ -118,6 +122,8 @@ module m_global_parameters
type(int_bounds_info) :: idwbuff(1:3)
type(int_bounds_info) :: bc_x, bc_y, bc_z !<
+ integer :: num_bc_patches
+ type(bc_patch_parameters) :: patch_bc(num_bc_patches_max)
!! Boundary conditions in the x-, y- and z-coordinate directions
logical :: parallel_io !< Format of the data files
@@ -139,9 +145,12 @@ module m_global_parameters
integer :: perturb_sph_fluid !< Fluid to be perturbed with perturb_sph flag
real(kind(0d0)), dimension(num_fluids_max) :: fluid_rho
- integer, allocatable, dimension(:) :: proc_coords !<
+ integer, dimension(1:3) :: proc_coords !<
!! Processor coordinates in MPI_CART_COMM
+ integer, dimension(1:3) :: proc_nums
+ !! Processor dimensions in MPI_CART_COMM
+
integer, allocatable, dimension(:) :: start_idx !<
!! Starting cell-center index of local processor in global grid
@@ -309,16 +318,15 @@ contains
weno_order = dflt_int
hypoelasticity = .false.
+ rdma_mpi = .false.
bc_x%beg = dflt_int; bc_x%end = dflt_int
bc_y%beg = dflt_int; bc_y%end = dflt_int
bc_z%beg = dflt_int; bc_z%end = dflt_int
#:for DIM in ['x', 'y', 'z']
- #:for DIR in [1, 2, 3]
- bc_${DIM}$%vb${DIR}$ = 0d0
- bc_${DIM}$%ve${DIR}$ = 0d0
- #:endfor
+ bc_${DIM}$%vel_beg = 0d0
+ bc_${DIM}$%vel_end = 0d0
#:endfor
parallel_io = .false.
@@ -386,6 +394,8 @@ contains
end if
end do
+ call s_bc_assign_default_values_to_user_inputs(num_bc_patches, patch_bc)
+
! Tait EOS
rhoref = dflt_real
pref = dflt_real
@@ -793,8 +803,6 @@ contains
num_dims = 1 + min(1, n) + min(1, p)
- allocate (proc_coords(1:num_dims))
-
if (parallel_io .neqv. .true.) return
#ifdef MFC_MPI
@@ -830,8 +838,6 @@ contains
end if
end if
- deallocate (proc_coords)
-
#ifdef MFC_MPI
if (parallel_io) then
diff --git a/src/pre_process/m_initial_condition.fpp b/src/pre_process/m_initial_condition.fpp
index dc51829069..13848be7e5 100644
--- a/src/pre_process/m_initial_condition.fpp
+++ b/src/pre_process/m_initial_condition.fpp
@@ -142,7 +142,7 @@ contains
do i = 1, num_patches
if (proc_rank == 0) then
- print *, 'Processing patch', i
+ print *, 'Processing 3D patch', i
end if
!> ICPP Patches
@@ -218,7 +218,7 @@ contains
do i = 1, num_patches
if (proc_rank == 0) then
- print *, 'Processing patch', i
+ print *, 'Processing 2D patch', i
end if
!> ICPP Patches
@@ -296,7 +296,7 @@ contains
do i = 1, num_patches
if (proc_rank == 0) then
- print *, 'Processing patch', i
+ print *, 'Processing 1D patch', i
end if
! Line segment patch
diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp
index 33895c7c4c..b84610bd9e 100644
--- a/src/pre_process/m_mpi_proxy.fpp
+++ b/src/pre_process/m_mpi_proxy.fpp
@@ -72,6 +72,25 @@ contains
call MPI_BCAST(${VAR}$, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
#:endfor
+ #:for VAR in ['x', 'y', 'z']
+ call MPI_BCAST(bc_${VAR}$%vel_beg, 3, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(bc_${VAR}$%vel_end, 3, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+ #:endfor
+
+ do i = 1, num_bc_patches_max
+ #:for VAR in [ 'type', 'dir', 'loc', 'geometry' ]
+ call MPI_BCAST(patch_bc(i)%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+ #:endfor
+
+ #:for VAR in [ 'centroid', 'length', 'vel' ]
+ call MPI_BCAST(patch_bc(i)%${VAR}$, size(patch_bc(i)%${VAR}$), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+ #:endfor
+
+ #:for VAR in [ 'radius' ]
+ call MPI_BCAST(patch_bc(i)%${VAR}$, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+ #:endfor
+ end do
+
do i = 1, num_patches_max
#:for VAR in [ 'geometry', 'smooth_patch_id']
call MPI_BCAST(patch_icpp(i)%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
@@ -116,421 +135,4 @@ contains
end subroutine s_mpi_bcast_user_inputs
- !> Description: This subroutine takes care of efficiently distributing
- !! the computational domain among the available processors
- !! as well as recomputing some of the global parameters so
- !! that they reflect the configuration of the sub-domain that is
- !! overseen by the local processor.
- subroutine s_mpi_decompose_computational_domain
-
-#ifdef MFC_MPI
-
- ! # of processors in the x-, y- and z-coordinate directions
- integer :: num_procs_x, num_procs_y, num_procs_z
-
- ! Temporary # of processors in x-, y- and z-coordinate directions
- ! used during the processor factorization optimization procedure
- real(kind(0d0)) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z
-
- ! Processor factorization (fct) minimization parameter
- real(kind(0d0)) :: fct_min
-
- ! Cartesian processor topology communicator
- integer :: MPI_COMM_CART
-
- ! Number of remaining cells for a particular coordinate direction
- ! after the bulk has evenly been distributed among the available
- ! processors for that coordinate direction
- integer :: rem_cells
-
- ! Generic loop iterators
- integer :: i, j
-
- if (num_procs == 1 .and. parallel_io) then
- do i = 1, num_dims
- start_idx(i) = 0
- end do
- return
- end if
-
- ! Performing the computational domain decomposition. The procedure
- ! is optimized by ensuring that each processor contains a close to
- ! equivalent piece of the computational domain. Note that explicit
- ! type-casting is omitted here for code legibility purposes.
-
- ! Generating 3D Cartesian Processor Topology =======================
-
- if (n > 0) then
-
- if (p > 0) then
-
- if (cyl_coord .and. p > 0) then
- ! Implement pencil processor blocking if using cylindrical coordinates so
- ! that all cells in the azimuthal direction are stored on a single processor.
- ! This is necessary for efficient application of the Fourier filter near the axis.
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = num_procs
- num_procs_z = 1
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- else
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = 1
- num_procs_z = num_procs
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + 10d0*abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- do j = 1, (num_procs/i)
-
- if (mod(num_procs/i, j) == 0 &
- .and. &
- (n + 1)/j >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = j
- tmp_num_procs_z = num_procs/(i*j)
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z) &
- .and. &
- (p + 1)/tmp_num_procs_z &
- >= &
- num_stcls_min*weno_order) &
- then
-
- num_procs_x = i
- num_procs_y = j
- num_procs_z = num_procs/(i*j)
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- end if
-
- end do
-
- end if
-
- ! Checking whether the decomposition of the computational
- ! domain was successful
- if (proc_rank == 0 .and. ierr == -1) then
- print '(A)', 'Unable to decompose computational '// &
- 'domain for selected number of '// &
- 'processors. Exiting ...'
- call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
- end if
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, &
- num_procs_y, num_procs_z/), &
- (/.true., .true., .true./), &
- .false., MPI_COMM_CART, ierr)
-
- ! Finding corresponding Cartesian coordinates of the local
- ! processor rank in newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, &
- proc_coords, ierr)
-
- ! END: Generating 3D Cartesian Processor Topology ==================
-
- ! Sub-domain Global Parameters in z-direction ======================
-
- ! Number of remaining cells after majority is distributed
- rem_cells = mod(p + 1, num_procs_z)
-
- ! Preliminary uniform cell-width spacing
- if (old_grid .neqv. .true.) then
- dz = (z_domain%end - z_domain%beg)/real(p + 1, kind(0d0))
- end if
-
- ! Optimal number of cells per processor
- p = (p + 1)/num_procs_z - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(3) == i - 1) then
- p = p + 1
- exit
- end if
- end do
-
- ! Beginning and end sub-domain boundary locations
- if (parallel_io .neqv. .true.) then
- if (old_grid .neqv. .true.) then
- if (proc_coords(3) < rem_cells) then
- z_domain%beg = z_domain%beg + dz*real((p + 1)* &
- proc_coords(3))
- z_domain%end = z_domain%end - dz*real((p + 1)* &
- (num_procs_z - proc_coords(3) - 1) &
- - (num_procs_z - rem_cells))
- else
- z_domain%beg = z_domain%beg + dz*real((p + 1)* &
- proc_coords(3) + rem_cells)
- z_domain%end = z_domain%end - dz*real((p + 1)* &
- (num_procs_z - proc_coords(3) - 1))
- end if
- end if
- else
- if (proc_coords(3) < rem_cells) then
- start_idx(3) = (p + 1)*proc_coords(3)
- else
- start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
- end if
- end if
-
- ! ==================================================================
-
- ! Generating 2D Cartesian Processor Topology =======================
-
- else
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = num_procs
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
-
- end if
-
- end do
-
- ! Checking whether the decomposition of the computational
- ! domain was successful
- if (proc_rank == 0 .and. ierr == -1) then
- print '(A)', 'Unable to decompose computational '// &
- 'domain for selected number of '// &
- 'processors. Exiting ...'
- call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
- end if
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, &
- num_procs_y/), (/.true., &
- .true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding corresponding Cartesian coordinates of the local
- ! processor rank in newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, &
- proc_coords, ierr)
-
- end if
-
- ! END: Generating 2D Cartesian Processor Topology ==================
-
- ! Sub-domain Global Parameters in y-direction ======================
-
- ! Number of remaining cells after majority has been distributed
- rem_cells = mod(n + 1, num_procs_y)
-
- ! Preliminary uniform cell-width spacing
- if (old_grid .neqv. .true.) then
- dy = (y_domain%end - y_domain%beg)/real(n + 1, kind(0d0))
- end if
-
- ! Optimal number of cells per processor
- n = (n + 1)/num_procs_y - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(2) == i - 1) then
- n = n + 1
- exit
- end if
- end do
-
- ! Beginning and end sub-domain boundary locations
- if (parallel_io .neqv. .true.) then
- if (old_grid .neqv. .true.) then
- if (proc_coords(2) < rem_cells) then
- y_domain%beg = y_domain%beg + dy*real((n + 1)* &
- proc_coords(2))
- y_domain%end = y_domain%end - dy*real((n + 1)* &
- (num_procs_y - proc_coords(2) - 1) &
- - (num_procs_y - rem_cells))
- else
- y_domain%beg = y_domain%beg + dy*real((n + 1)* &
- proc_coords(2) + rem_cells)
- y_domain%end = y_domain%end - dy*real((n + 1)* &
- (num_procs_y - proc_coords(2) - 1))
- end if
- end if
- else
- if (proc_coords(2) < rem_cells) then
- start_idx(2) = (n + 1)*proc_coords(2)
- else
- start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
- end if
- end if
-
- ! ==================================================================
-
- ! Generating 1D Cartesian Processor Topology =======================
-
- else
-
- ! Number of processors in the coordinate direction is equal to
- ! the total number of processors available
- num_procs_x = num_procs
-
- ! Creating a new communicator using Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), &
- (/.true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding the corresponding Cartesian coordinates of the local
- ! processor rank in the newly declared cartesian communicator
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, &
- proc_coords, ierr)
-
- end if
-
- ! ==================================================================
-
- ! Sub-domain Global Parameters in x-direction ======================
-
- ! Number of remaining cells after majority has been distributed
- rem_cells = mod(m + 1, num_procs_x)
-
- ! Preliminary uniform cell-width spacing
- if (old_grid .neqv. .true.) then
- dx = (x_domain%end - x_domain%beg)/real(m + 1, kind(0d0))
- end if
-
- ! Optimal number of cells per processor
- m = (m + 1)/num_procs_x - 1
-
- ! Distributing any remaining cells
- do i = 1, rem_cells
- if (proc_coords(1) == i - 1) then
- m = m + 1
- exit
- end if
- end do
-
- ! Beginning and end sub-domain boundary locations
- if (parallel_io .neqv. .true.) then
- if (old_grid .neqv. .true.) then
- if (proc_coords(1) < rem_cells) then
- x_domain%beg = x_domain%beg + dx*real((m + 1)* &
- proc_coords(1))
- x_domain%end = x_domain%end - dx*real((m + 1)* &
- (num_procs_x - proc_coords(1) - 1) &
- - (num_procs_x - rem_cells))
- else
- x_domain%beg = x_domain%beg + dx*real((m + 1)* &
- proc_coords(1) + rem_cells)
- x_domain%end = x_domain%end - dx*real((m + 1)* &
- (num_procs_x - proc_coords(1) - 1))
- end if
- end if
- else
- if (proc_coords(1) < rem_cells) then
- start_idx(1) = (m + 1)*proc_coords(1)
- else
- start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
- end if
- end if
-
- ! ==================================================================
-
-#endif
-
- end subroutine s_mpi_decompose_computational_domain
-
end module m_mpi_proxy
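
The removed s_mpi_decompose_computational_domain picks the number of ranks per direction by exhaustively searching the factorizations of num_procs and minimizing the imbalance between per-rank cell counts, while requiring at least num_stcls_min*weno_order cells per rank in every split direction. The kernel of that 2D search is sketched below with made-up values for num_procs, m, n, and the minimum cell count; it is illustrative only and not part of the patch.

! Illustrative sketch, not part of the patch: the 2D factorization search used
! by the removed domain decomposition, balancing (m+1)/px against (n+1)/py.
program decomp_sketch
    implicit none
    integer, parameter :: num_procs = 12, m = 199, n = 99, min_cells = 10
    integer :: i, px, py
    real(kind(0d0)) :: fct, fct_min

    ! Start from a deliberately poor bound, as the original routine does.
    px = 1; py = num_procs
    fct_min = 10d0*abs(real(m + 1, kind(0d0))/px - real(n + 1, kind(0d0))/py)

    do i = 1, num_procs
        if (mod(num_procs, i) == 0 .and. (m + 1)/i >= min_cells) then
            fct = abs(real(m + 1, kind(0d0))/i &
                      - real(n + 1, kind(0d0))/(num_procs/i))
            if (fct <= fct_min .and. (n + 1)/(num_procs/i) >= min_cells) then
                px = i; py = num_procs/i; fct_min = fct
            end if
        end if
    end do

    print *, 'chosen decomposition:', px, 'x', py
end program decomp_sketch
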
diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp
index 3d78432591..6a0738438e 100644
--- a/src/pre_process/m_start_up.fpp
+++ b/src/pre_process/m_start_up.fpp
@@ -50,6 +50,8 @@ module m_start_up
use m_checker_common
use m_checker
+
+ use m_boundary_conditions_common
! ==========================================================================
implicit none
@@ -128,8 +130,8 @@ contains
stretch_x, stretch_y, stretch_z, a_x, a_y, &
a_z, x_a, y_a, z_a, x_b, y_b, z_b, &
model_eqns, num_fluids, mpp_lim, &
- weno_order, bc_x, bc_y, bc_z, num_patches, &
- hypoelasticity, patch_icpp, fluid_pp, precision, parallel_io, &
+ weno_order, bc_x, bc_y, bc_z, patch_bc, num_patches, num_bc_patches, &
+ hypoelasticity, rdma_mpi, patch_icpp, fluid_pp, precision, parallel_io, &
mixlayer_vel_profile, mixlayer_vel_coef, mixlayer_domain, &
mixlayer_perturb, &
pi_fac, perturb_flow, perturb_flow_fluid, perturb_flow_mag, &
@@ -735,6 +737,8 @@ contains
call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting... ')
end if
+ call s_read_boundary_condition_files(restart_dir)
+
if (ib) then
write (file_loc, '(A)') 'ib.dat'
@@ -768,6 +772,8 @@ contains
! Computation of parameters, allocation procedures, and/or any other tasks
! needed to properly setup the modules
call s_initialize_global_parameters_module()
+ call s_initialize_boundary_conditions_module()
+
!Quadrature weights and nodes for polydisperse simulations
if (bubbles .and. nb > 1) then
call s_simpson
@@ -823,6 +829,8 @@ contains
call s_read_grid_data_files()
call s_check_grid_data_files()
end if
+
+ call s_generate_boundary_condition_patch_buffers()
end if
end subroutine s_read_grid
@@ -922,6 +930,7 @@ contains
call s_mpi_bcast_user_inputs()
call s_initialize_parallel_io()
call s_mpi_decompose_computational_domain()
+
end subroutine s_initialize_mpi_domain
subroutine s_finalize_modules
diff --git a/src/simulation/m_boundary_conditions.fpp b/src/simulation/m_boundary_conditions.fpp
index 266be8ed00..85a85da31a 100644
--- a/src/simulation/m_boundary_conditions.fpp
+++ b/src/simulation/m_boundary_conditions.fpp
@@ -2,6 +2,8 @@
!! @file m_boundary_conditions.fpp
!! @brief Contains module m_boundary_conditions
+#:include 'macros.fpp'
+
!> @brief The purpose of the module is to apply noncharacteristic and processor
!! boundary conditions
module m_boundary_conditions
@@ -11,1628 +13,49 @@ module m_boundary_conditions
use m_global_parameters !< Definitions of the global parameters
- use m_mpi_proxy
+ use m_mpi_proxy !< Definitions of the MPI proxy
use m_constants
+
+ use m_boundary_conditions_common
+
! ==========================================================================
implicit none
private;
- public :: s_populate_variables_buffers, &
- s_populate_capillary_buffers
+ public :: s_populate_prim_buffers, &
+ s_populate_capillary_buffers, &
+ s_initialize_boundary_conditions_module
contains
- !> The purpose of this procedure is to populate the buffers
- !! of the primitive variables, depending on the selected
- !! boundary conditions.
- subroutine s_populate_variables_buffers(q_prim_vf, pb, mv)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
-
- integer :: bc_loc, bc_dir
-
- ! Population of Buffers in x-direction =============================
-
- select case (bc_x%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 1, -1)
- case (-2) ! Symmetry BC at beginning
- call s_symmetry(q_prim_vf, pb, mv, 1, -1)
- case (-1) ! Periodic BC at beginning
- call s_periodic(q_prim_vf, pb, mv, 1, -1)
- case (-15) ! Slip wall BC at beginning
- call s_slip_wall(q_prim_vf, pb, mv, 1, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_no_slip_wall(q_prim_vf, pb, mv, 1, -1)
- case default ! Processor BC at beginning
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 1, -1)
- end select
-
- select case (bc_x%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 1, 1)
- case (-2) ! Symmetry BC at end
- call s_symmetry(q_prim_vf, pb, mv, 1, 1)
- case (-1) ! Periodic BC at end
- call s_periodic(q_prim_vf, pb, mv, 1, 1)
- case (-15) ! Slip wall BC at end
- call s_slip_wall(q_prim_vf, pb, mv, 1, 1)
- case (-16) ! No-slip wall bc at end
- call s_no_slip_wall(q_prim_vf, pb, mv, 1, 1)
- case default ! Processor BC at end
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 1, 1)
- end select
-
- if (qbmm .and. .not. polytropic) then
- select case (bc_x%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_qbmm_extrapolation(pb, mv, 1, -1)
- case (-15) ! Slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 1, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 1, -1)
- end select
-
- select case (bc_x%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_qbmm_extrapolation(pb, mv, 1, 1)
- case (-15) ! Slip wall BC at end
- call s_qbmm_extrapolation(pb, mv, 1, 1)
- case (-16) ! No-slip wall bc at end
- call s_qbmm_extrapolation(pb, mv, 1, 1)
- end select
- end if
-
- ! END: Population of Buffers in x-direction ========================
-
- ! Population of Buffers in y-direction =============================
-
- if (n == 0) return
-
- select case (bc_y%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 2, -1)
- case (-14) ! Axis BC at beginning
- call s_axis(q_prim_vf, pb, mv, 2, -1)
- case (-2) ! Symmetry BC at beginning
- call s_symmetry(q_prim_vf, pb, mv, 2, -1)
- case (-1) ! Periodic BC at beginning
- call s_periodic(q_prim_vf, pb, mv, 2, -1)
- case (-15) ! Slip wall BC at beginning
- call s_slip_wall(q_prim_vf, pb, mv, 2, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_no_slip_wall(q_prim_vf, pb, mv, 2, -1)
- case default ! Processor BC at beginning
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 2, -1)
- end select
-
- select case (bc_y%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 2, 1)
- case (-2) ! Symmetry BC at end
- call s_symmetry(q_prim_vf, pb, mv, 2, 1)
- case (-1) ! Periodic BC at end
- call s_periodic(q_prim_vf, pb, mv, 2, 1)
- case (-15) ! Slip wall BC at end
- call s_slip_wall(q_prim_vf, pb, mv, 2, 1)
- case (-16) ! No-slip wall BC at end
- call s_no_slip_wall(q_prim_vf, pb, mv, 2, 1)
- case default ! Processor BC at end
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 2, 1)
- end select
-
- if (qbmm .and. .not. polytropic) then
-
- select case (bc_y%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_qbmm_extrapolation(pb, mv, 2, -1)
- case (-15) ! Slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 2, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 2, -1)
- end select
-
- select case (bc_y%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_qbmm_extrapolation(pb, mv, 2, 1)
- case (-15) ! Slip wall BC at end
- call s_qbmm_extrapolation(pb, mv, 2, 1)
- case (-16) ! No-slip wall BC at end
- call s_qbmm_extrapolation(pb, mv, 2, 1)
- end select
-
- end if
-
- ! END: Population of Buffers in y-direction ========================
-
- ! Population of Buffers in z-direction =============================
-
- if (p == 0) return
-
- select case (bc_z%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 3, -1)
- case (-2) ! Symmetry BC at beginning
- call s_symmetry(q_prim_vf, pb, mv, 3, -1)
- case (-1) ! Periodic BC at beginning
- call s_periodic(q_prim_vf, pb, mv, 3, -1)
- case (-15) ! Slip wall BC at beginning
- call s_slip_wall(q_prim_vf, pb, mv, 3, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_no_slip_wall(q_prim_vf, pb, mv, 3, -1)
- case default ! Processor BC at beginning
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 3, -1)
- end select
-
- select case (bc_z%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_ghost_cell_extrapolation(q_prim_vf, pb, mv, 3, 1)
- case (-2) ! Symmetry BC at end
- call s_symmetry(q_prim_vf, pb, mv, 3, 1)
- case (-1) ! Periodic BC at end
- call s_periodic(q_prim_vf, pb, mv, 3, 1)
- case (-15) ! Slip wall BC at end
- call s_slip_wall(q_prim_vf, pb, mv, 3, 1)
- case (-16) ! No-slip wall BC at end
- call s_no_slip_wall(q_prim_vf, pb, mv, 3, 1)
- case default ! Processor BC at end
- call s_mpi_sendrecv_variables_buffers( &
- q_prim_vf, pb, mv, 3, 1)
- end select
-
- if (qbmm .and. .not. polytropic) then
-
- select case (bc_z%beg)
- case (-13:-3) ! Ghost-cell extrap. BC at beginning
- call s_qbmm_extrapolation(pb, mv, 3, -1)
- case (-15) ! Slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 3, -1)
- case (-16) ! No-slip wall BC at beginning
- call s_qbmm_extrapolation(pb, mv, 3, -1)
- end select
-
- select case (bc_z%end)
- case (-13:-3) ! Ghost-cell extrap. BC at end
- call s_qbmm_extrapolation(pb, mv, 3, 1)
- case (-15) ! Slip wall BC at end
- call s_qbmm_extrapolation(pb, mv, 3, 1)
- case (-16) ! No-slip wall BC at end
- call s_qbmm_extrapolation(pb, mv, 3, 1)
- end select
-
- end if
-
- ! END: Population of Buffers in z-direction ========================
-
- end subroutine s_populate_variables_buffers
-
- subroutine s_ghost_cell_extrapolation(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then !< x-direction
-
- if (bc_loc == -1) then !bc_x%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(0, k, l)
- end do
- end do
- end do
- end do
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(m, k, l)
- end do
- end do
- end do
- end do
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then !< y-direction
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, 0, k)
- end do
- end do
- end do
- end do
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, n, k)
- end do
- end do
- end do
- end do
-
- end if
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then !< z-direction
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, 0)
- end do
- end do
- end do
- end do
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, p)
- end do
- end do
- end do
- end do
-
- end if
-
- end if
- !< =====================================================================
-
- end subroutine s_ghost_cell_extrapolation
-
- subroutine s_symmetry(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then
-
- if (bc_loc == -1) then !< bc_x%beg
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- !$acc loop seq
- do i = 1, contxe
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(j - 1, k, l)
- end do
-
- q_prim_vf(momxb)%sf(-j, k, l) = &
- -q_prim_vf(momxb)%sf(j - 1, k, l)
-
- !$acc loop seq
- do i = momxb + 1, sys_size
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(j - 1, k, l)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(-j, k, l, q, i) = &
- pb(j - 1, k, l, q, i)
- mv(-j, k, l, q, i) = &
- mv(j - 1, k, l, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(3) default(present)
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
-
- !$acc loop seq
- do i = 1, contxe
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(m - (j - 1), k, l)
- end do
-
- q_prim_vf(momxb)%sf(m + j, k, l) = &
- -q_prim_vf(momxb)%sf(m - (j - 1), k, l)
-
- !$acc loop seq
- do i = momxb + 1, sys_size
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(m - (j - 1), k, l)
- end do
-
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(m + j, k, l, q, i) = &
- pb(m - (j - 1), k, l, q, i)
- mv(m + j, k, l, q, i) = &
- mv(m - (j - 1), k, l, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- !$acc loop seq
- do i = 1, momxb
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k)
- end do
-
- q_prim_vf(momxb + 1)%sf(l, -j, k) = &
- -q_prim_vf(momxb + 1)%sf(l, j - 1, k)
-
- !$acc loop seq
- do i = momxb + 2, sys_size
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, -j, k, q, i) = &
- pb(l, j - 1, k, q, i)
- mv(l, -j, k, q, i) = &
- mv(l, j - 1, k, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- !$acc loop seq
- do i = 1, momxb
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, n - (j - 1), k)
- end do
-
- q_prim_vf(momxb + 1)%sf(l, n + j, k) = &
- -q_prim_vf(momxb + 1)%sf(l, n - (j - 1), k)
-
- !$acc loop seq
- do i = momxb + 2, sys_size
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, n - (j - 1), k)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, n + j, k, q, i) = &
- pb(l, n - (j - 1), k, q, i)
- mv(l, n + j, k, q, i) = &
- mv(l, n - (j - 1), k, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- !$acc loop seq
- do i = 1, momxb + 1
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, j - 1)
- end do
-
- q_prim_vf(momxe)%sf(k, l, -j) = &
- -q_prim_vf(momxe)%sf(k, l, j - 1)
-
- !$acc loop seq
- do i = E_idx, sys_size
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, j - 1)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, -j, q, i) = &
- pb(k, l, j - 1, q, i)
- mv(k, l, -j, q, i) = &
- mv(k, l, j - 1, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- !$acc loop seq
- do i = 1, momxb + 1
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, p - (j - 1))
- end do
-
- q_prim_vf(momxe)%sf(k, l, p + j) = &
- -q_prim_vf(momxe)%sf(k, l, p - (j - 1))
-
- !$acc loop seq
- do i = E_idx, sys_size
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, p - (j - 1))
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, p + j, q, i) = &
- pb(k, l, p - (j - 1), q, i)
- mv(k, l, p + j, q, i) = &
- mv(k, l, p - (j - 1), q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- end if
- !< =====================================================================
-
- end subroutine s_symmetry
-
- subroutine s_periodic(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then
-
- if (bc_loc == -1) then !< bc_x%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(m - (j - 1), k, l)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(-j, k, l, q, i) = &
- pb(m - (j - 1), k, l, q, i)
- mv(-j, k, l, q, i) = &
- mv(m - (j - 1), k, l, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(j - 1, k, l)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(m + j, k, l, q, i) = &
- pb(j - 1, k, l, q, i)
- mv(m + j, k, l, q, i) = &
- mv(j - 1, k, l, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, n - (j - 1), k)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, -j, k, q, i) = &
- pb(l, n - (j - 1), k, q, i)
- mv(l, -j, k, q, i) = &
- mv(l, n - (j - 1), k, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, n + j, k, q, i) = &
- pb(l, (j - 1), k, q, i)
- mv(l, n + j, k, q, i) = &
- mv(l, (j - 1), k, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, p - (j - 1))
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, -j, q, i) = &
- pb(k, l, p - (j - 1), q, i)
- mv(k, l, -j, q, i) = &
- mv(k, l, p - (j - 1), q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, j - 1)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, p + j, q, i) = &
- pb(k, l, j - 1, q, i)
- mv(k, l, p + j, q, i) = &
- mv(k, l, j - 1, q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end if
-
- end if
- !< =====================================================================
-
- end subroutine s_periodic
-
- subroutine s_axis(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !$acc parallel loop collapse(3) gang vector default(present)
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (z_cc(k) < pi) then
- !$acc loop seq
- do i = 1, momxb
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k + ((p + 1)/2))
- end do
+ subroutine s_populate_capillary_buffers(c_divs)
- q_prim_vf(momxb + 1)%sf(l, -j, k) = &
- -q_prim_vf(momxb + 1)%sf(l, j - 1, k + ((p + 1)/2))
+ type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
- q_prim_vf(momxe)%sf(l, -j, k) = &
- -q_prim_vf(momxe)%sf(l, j - 1, k + ((p + 1)/2))
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
- !$acc loop seq
- do i = E_idx, sys_size
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k + ((p + 1)/2))
- end do
+ #:for dir, loc in itertools.product([1, 2, 3], [-1, 1])
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir=dir, loc=loc, outer_loops=[("i", 1, "num_dims + 1")])
+ select case (bc_id_sfs(${dir}$, ${loc}$)%sf(exlhs, eylhs, ezlhs)%type)
+ case (-13:-3);
+ c_divs(i)%sf(x, y, z) = c_divs(i)%sf(ex, ey, ez)
+ case (-2); !< slip wall or reflective
+ if (i == 1) then
+ c_divs(i)%sf(x, y, z) = -c_divs(i)%sf(sx, sy, sz)
else
- !$acc loop seq
- do i = 1, momxb
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k - ((p + 1)/2))
- end do
-
- q_prim_vf(momxb + 1)%sf(l, -j, k) = &
- -q_prim_vf(momxb + 1)%sf(l, j - 1, k - ((p + 1)/2))
-
- q_prim_vf(momxe)%sf(l, -j, k) = &
- -q_prim_vf(momxe)%sf(l, j - 1, k - ((p + 1)/2))
-
- !$acc loop seq
- do i = E_idx, sys_size
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, j - 1, k - ((p + 1)/2))
- end do
+ c_divs(i)%sf(x, y, z) = +c_divs(i)%sf(sx, sy, sz)
end if
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, -j, k, q, i) = &
- pb(l, j - 1, k - ((p + 1)/2), q, i)
- mv(l, -j, k, q, i) = &
- mv(l, j - 1, k - ((p + 1)/2), q, i)
- end do
- end do
- end do
- end do
- end do
- end if
-
- end subroutine s_axis
-
- subroutine s_slip_wall(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then
-
- if (bc_loc == -1) then !< bc_x%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(-j, k, l) = &
- -q_prim_vf(i)%sf(j - 1, k, l) + 2d0*bc_x%vb1
- else
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(0, k, l)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(m + j, k, l) = &
- -q_prim_vf(i)%sf(m - (j - 1), k, l) + 2d0*bc_x%ve1
- else
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(m, k, l)
- end if
- end do
- end do
- end do
- end do
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == momxb + 1) then
- q_prim_vf(i)%sf(l, -j, k) = &
- -q_prim_vf(i)%sf(l, j - 1, k) + 2d0*bc_y%vb2
- else
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, 0, k)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == momxb + 1) then
- q_prim_vf(i)%sf(l, n + j, k) = &
- -q_prim_vf(i)%sf(l, n - (j - 1), k) + 2d0*bc_y%ve2
- else
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, n, k)
- end if
- end do
- end do
- end do
- end do
+ case (-1);
+ c_divs(i)%sf(x, y, z) = c_divs(i)%sf(px, py, pz)
+ end select
+ #:endblock
+ if (${dir}$ <= num_dims) then
+ call s_mpi_sendrecv_capilary_variables_buffers(c_divs, bc_id_sfs, ${dir}$, ${loc}$)
end if
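
The select case introduced above replaces the direction-by-direction buffer loops removed below: for every ghost cell it copies from an extrapolation source, a mirrored source, or a periodic source, depending on the boundary type stored in bc_id_sfs. A self-contained 1D sketch of those three index mappings for a beg-side buffer, with made-up sizes, follows; it is illustrative only and not part of the patch (the capillary routine additionally negates the first component in the reflective case).

! Illustrative sketch, not part of the patch: source-cell mappings for a ghost
! cell at index -j on the beg side of a field with interior cells 0..m.
program ghost_cell_sketch
    implicit none
    integer, parameter :: m = 7, buff_size = 2
    integer, parameter :: bc_type = -1   ! -13:-3 extrapolation, -2 reflective, -1 periodic
    real(kind(0d0)) :: f(-buff_size:m + buff_size)
    integer :: j

    f = 0d0
    f(0:m) = [(real(j, kind(0d0)), j=0, m)]   ! arbitrary interior data

    do j = 1, buff_size
        select case (bc_type)
        case (-13:-3)
            f(-j) = f(0)             ! ghost-cell extrapolation
        case (-2)
            f(-j) = f(j - 1)         ! mirror about the domain edge
        case (-1)
            f(-j) = f(m - (j - 1))   ! periodic wrap-around
        end select
    end do

    print *, 'ghost cells:', f(-buff_size:-1)
end program ghost_cell_sketch
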
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == momxe) then
- q_prim_vf(i)%sf(k, l, -j) = &
- -q_prim_vf(i)%sf(k, l, j - 1) + 2d0*bc_z%vb3
- else
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, 0)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == momxe) then
- q_prim_vf(i)%sf(k, l, p + j) = &
- -q_prim_vf(i)%sf(k, l, p - (j - 1)) + 2d0*bc_z%ve3
- else
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, p)
- end if
- end do
- end do
- end do
- end do
-
- end if
-
- end if
- !< =====================================================================
-
- end subroutine s_slip_wall
-
- subroutine s_no_slip_wall(q_prim_vf, pb, mv, bc_dir, bc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then
-
- if (bc_loc == -1) then !< bc_x%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(-j, k, l) = &
- -q_prim_vf(i)%sf(j - 1, k, l) + 2d0*bc_x%vb1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(-j, k, l) = &
- -q_prim_vf(i)%sf(j - 1, k, l) + 2d0*bc_x%vb2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(-j, k, l) = &
- -q_prim_vf(i)%sf(j - 1, k, l) + 2d0*bc_x%vb3
- else
- q_prim_vf(i)%sf(-j, k, l) = &
- q_prim_vf(i)%sf(0, k, l)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(m + j, k, l) = &
- -q_prim_vf(i)%sf(m - (j - 1), k, l) + 2d0*bc_x%ve1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(m + j, k, l) = &
- -q_prim_vf(i)%sf(m - (j - 1), k, l) + 2d0*bc_x%ve2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(m + j, k, l) = &
- -q_prim_vf(i)%sf(m - (j - 1), k, l) + 2d0*bc_x%ve3
- else
- q_prim_vf(i)%sf(m + j, k, l) = &
- q_prim_vf(i)%sf(m, k, l)
- end if
- end do
- end do
- end do
- end do
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(l, -j, k) = &
- -q_prim_vf(i)%sf(l, j - 1, k) + 2d0*bc_y%vb1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(l, -j, k) = &
- -q_prim_vf(i)%sf(l, j - 1, k) + 2d0*bc_y%vb2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(l, -j, k) = &
- -q_prim_vf(i)%sf(l, j - 1, k) + 2d0*bc_y%vb3
- else
- q_prim_vf(i)%sf(l, -j, k) = &
- q_prim_vf(i)%sf(l, 0, k)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(l, n + j, k) = &
- -q_prim_vf(i)%sf(l, n - (j - 1), k) + 2d0*bc_y%ve1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(l, n + j, k) = &
- -q_prim_vf(i)%sf(l, n - (j - 1), k) + 2d0*bc_y%ve2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(l, n + j, k) = &
- -q_prim_vf(i)%sf(l, n - (j - 1), k) + 2d0*bc_y%ve3
- else
- q_prim_vf(i)%sf(l, n + j, k) = &
- q_prim_vf(i)%sf(l, n, k)
- end if
- end do
- end do
- end do
- end do
-
- end if
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(k, l, -j) = &
- -q_prim_vf(i)%sf(k, l, j - 1) + 2d0*bc_z%vb1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(k, l, -j) = &
- -q_prim_vf(i)%sf(k, l, j - 1) + 2d0*bc_z%vb2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(k, l, -j) = &
- -q_prim_vf(i)%sf(k, l, j - 1) + 2d0*bc_z%vb3
- else
- q_prim_vf(i)%sf(k, l, -j) = &
- q_prim_vf(i)%sf(k, l, 0)
- end if
- end do
- end do
- end do
- end do
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, sys_size
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == momxb) then
- q_prim_vf(i)%sf(k, l, p + j) = &
- -q_prim_vf(i)%sf(k, l, p - (j - 1)) + 2d0*bc_z%ve1
- elseif (i == momxb + 1 .and. num_dims > 1) then
- q_prim_vf(i)%sf(k, l, p + j) = &
- -q_prim_vf(i)%sf(k, l, p - (j - 1)) + 2d0*bc_z%ve2
- elseif (i == momxb + 2 .and. num_dims > 2) then
- q_prim_vf(i)%sf(k, l, p + j) = &
- -q_prim_vf(i)%sf(k, l, p - (j - 1)) + 2d0*bc_z%ve3
- else
- q_prim_vf(i)%sf(k, l, p + j) = &
- q_prim_vf(i)%sf(k, l, p)
- end if
- end do
- end do
- end do
- end do
-
- end if
-
- end if
- !< =====================================================================
-
- end subroutine s_no_slip_wall
-
- subroutine s_qbmm_extrapolation(pb, mv, bc_dir, bc_loc)
-
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
- integer, intent(in) :: bc_dir, bc_loc
-
- integer :: j, k, l, q, i
-
- !< x-direction =========================================================
- if (bc_dir == 1) then
-
- if (bc_loc == -1) then !< bc_x%beg
-
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(-j, k, l, q, i) = &
- pb(0, k, l, q, i)
- mv(-j, k, l, q, i) = &
- mv(0, k, l, q, i)
- end do
- end do
- end do
- end do
- end do
-
- else !< bc_x%end
-
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- pb(m + j, k, l, q, i) = &
- pb(m, k, l, q, i)
- mv(m + j, k, l, q, i) = &
- mv(m, k, l, q, i)
- end do
- end do
- end do
- end do
- end do
-
- end if
-
- !< y-direction =========================================================
- elseif (bc_dir == 2) then
-
- if (bc_loc == -1) then !< bc_y%beg
-
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, -j, k, q, i) = &
- pb(l, 0, k, q, i)
- mv(l, -j, k, q, i) = &
- mv(l, 0, k, q, i)
- end do
- end do
- end do
- end do
- end do
-
- else !< bc_y%end
-
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- pb(l, n + j, k, q, i) = &
- pb(l, n, k, q, i)
- mv(l, n + j, k, q, i) = &
- mv(l, n, k, q, i)
- end do
- end do
- end do
- end do
- end do
-
- end if
-
- !< z-direction =========================================================
- elseif (bc_dir == 3) then
-
- if (bc_loc == -1) then !< bc_z%beg
-
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, -j, q, i) = &
- pb(k, l, 0, q, i)
- mv(k, l, -j, q, i) = &
- mv(k, l, 0, q, i)
- end do
- end do
- end do
- end do
- end do
-
- else !< bc_z%end
-
- !$acc parallel loop collapse(5) gang vector default(present)
- do i = 1, nb
- do q = 1, nnode
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- pb(k, l, p + j, q, i) = &
- pb(k, l, p, q, i)
- mv(k, l, p + j, q, i) = &
- mv(k, l, p, q, i)
- end do
- end do
- end do
- end do
- end do
-
- end if
-
- end if
-
- end subroutine
-
- subroutine s_populate_capillary_buffers(c_divs)
-
- type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs
- integer :: i, j, k, l
-
- ! x - direction
- if (bc_x%beg <= -3) then !< ghost cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- c_divs(i)%sf(-j, k, l) = &
- c_divs(i)%sf(0, k, l)
- end do
- end do
- end do
- end do
- elseif (bc_x%beg == -2) then !< slip wall or reflective
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == 1) then
- c_divs(i)%sf(-j, k, l) = &
- -c_divs(i)%sf(j - 1, k, l)
- else
- c_divs(i)%sf(-j, k, l) = &
- c_divs(i)%sf(j - 1, k, l)
- end if
- end do
- end do
- end do
- end do
- elseif (bc_x%beg == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- c_divs(i)%sf(-j, k, l) = &
- c_divs(i)%sf(m - (j - 1), k, l)
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, -1)
- end if
-
- if (bc_x%end <= -3) then !< ghost-cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- c_divs(i)%sf(m + j, k, l) = &
- c_divs(i)%sf(m, k, l)
- end do
- end do
- end do
- end do
- elseif (bc_x%end == -2) then
- !$acc parallel loop collapse(4) default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- if (i == 1) then
- c_divs(i)%sf(m + j, k, l) = &
- -c_divs(i)%sf(m - (j - 1), k, l)
- else
- c_divs(i)%sf(m + j, k, l) = &
- c_divs(i)%sf(m - (j - 1), k, l)
- end if
- end do
- end do
- end do
- end do
- else if (bc_x%end == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do l = 0, p
- do k = 0, n
- do j = 1, buff_size
- c_divs(i)%sf(m + j, k, l) = &
- c_divs(i)%sf(j - 1, k, l)
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 1, 1)
- end if
-
- if (n == 0) then
- return
- elseif (bc_y%beg <= -3) then !< ghost-cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- c_divs(i)%sf(l, -j, k) = &
- c_divs(i)%sf(l, 0, k)
- end do
- end do
- end do
- end do
- elseif (bc_y%beg == -2) then !< slip wall or reflective
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == 2) then
- c_divs(i)%sf(l, -j, k) = &
- -c_divs(i)%sf(l, j - 1, k)
- else
- c_divs(i)%sf(l, -j, k) = &
- c_divs(i)%sf(l, j - 1, k)
- end if
- end do
- end do
- end do
- end do
- elseif (bc_y%beg == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- c_divs(i)%sf(l, -j, k) = &
- c_divs(i)%sf(l, n - (j - 1), k)
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, -1)
- end if
-
- if (bc_y%end <= -3) then !< ghost-cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- c_divs(i)%sf(l, n + j, k) = &
- c_divs(i)%sf(l, n, k)
- end do
- end do
- end do
- end do
- elseif (bc_y%end == -2) then !< slip wall or reflective
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- if (i == 2) then
- c_divs(i)%sf(l, n + j, k) = &
- -c_divs(i)%sf(l, n - (j - 1), k)
- else
- c_divs(i)%sf(l, n + j, k) = &
- c_divs(i)%sf(l, n - (j - 1), k)
- end if
- end do
- end do
- end do
- end do
- elseif (bc_y%end == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do k = 0, p
- do j = 1, buff_size
- do l = -buff_size, m + buff_size
- c_divs(i)%sf(l, n + j, k) = &
- c_divs(i)%sf(l, j - 1, k)
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 2, 1)
- end if
-
- if (p == 0) then
- return
- elseif (bc_z%beg <= -3) then !< ghost-cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- c_divs(i)%sf(k, l, -j) = &
- c_divs(i)%sf(k, l, 0)
- end do
- end do
- end do
- end do
- elseif (bc_z%beg == -2) then !< symmetry
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == 3) then
- c_divs(i)%sf(k, l, -j) = &
- -c_divs(i)%sf(k, l, j - 1)
- else
- c_divs(i)%sf(k, l, -j) = &
- c_divs(i)%sf(k, l, j - 1)
- end if
- end do
- end do
- end do
- end do
- elseif (bc_z%beg == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- c_divs(i)%sf(k, l, -j) = &
- c_divs(i)%sf(k, l, p - (j - 1))
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, -1)
- end if
-
- if (bc_z%end <= -3) then !< ghost-cell extrapolation
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- c_divs(i)%sf(k, l, p + j) = &
- c_divs(i)%sf(k, l, p)
- end do
- end do
- end do
- end do
- elseif (bc_z%end == -2) then !< symmetry
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- if (i == 3) then
- c_divs(i)%sf(k, l, p + j) = &
- -c_divs(i)%sf(k, l, p - (j - 1))
- else
- c_divs(i)%sf(k, l, p + j) = &
- c_divs(i)%sf(k, l, p - (j - 1))
- end if
- end do
- end do
- end do
- end do
- elseif (bc_z%end == -1) then
- !$acc parallel loop collapse(4) gang vector default(present)
- do i = 1, num_dims + 1
- do j = 1, buff_size
- do l = -buff_size, n + buff_size
- do k = -buff_size, m + buff_size
- c_divs(i)%sf(k, l, p + j) = &
- c_divs(i)%sf(k, l, j - 1)
- end do
- end do
- end do
- end do
- else
- call s_mpi_sendrecv_capilary_variables_buffers(c_divs, 3, 1)
- end if
+ #:endfor
end subroutine s_populate_capillary_buffers
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index db5d17656e..fef58ba7b3 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -22,6 +22,8 @@ module m_global_parameters
use m_helper_basic !< Functions to compare floating point numbers
+ use m_global_parameters_common
+
#ifdef MFC_OpenACC
use openacc
#endif
@@ -184,6 +186,8 @@ module m_global_parameters
!> @name Boundary conditions (BC) in the x-, y- and z-directions, respectively
!> @{
type(int_bounds_info) :: bc_x, bc_y, bc_z
+ integer :: num_bc_patches
+ type(bc_patch_parameters) :: patch_bc(num_bc_patches_max)
!> @}
type(bounds_info) :: x_domain, y_domain, z_domain
@@ -191,9 +195,12 @@ module m_global_parameters
logical :: file_per_process !< shared file or not when using parallel io
integer :: precision !< Precision of output files
- integer, allocatable, dimension(:) :: proc_coords !<
+ integer, dimension(1:3) :: proc_coords !<
!! Processor coordinates in MPI_CART_COMM
+ integer, dimension(1:3) :: proc_nums !<
+ !! Processor dimensions in MPI_CART_COMM
+
integer, allocatable, dimension(:) :: start_idx !<
!! Starting cell-center index of local processor in global grid
@@ -295,9 +302,9 @@ module m_global_parameters
!! in the flow. These include the stiffened gas equation of state parameters,
!! the Reynolds numbers and the Weber numbers.
- !$acc declare create(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3)
- !$acc declare create(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3)
- !$acc declare create(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3)
+ !$acc declare create(bc_x%vel_beg, bc_x%vel_end)
+ !$acc declare create(bc_y%vel_beg, bc_y%vel_end)
+ !$acc declare create(bc_z%vel_beg, bc_z%vel_end)
! ==========================================================================
@@ -535,16 +542,16 @@ contains
bc_z%beg = dflt_int; bc_z%end = dflt_int
#:for DIM in ['x', 'y', 'z']
- #:for DIR in [1, 2, 3]
- bc_${DIM}$%vb${DIR}$ = 0d0
- bc_${DIM}$%ve${DIR}$ = 0d0
- #:endfor
+ bc_${DIM}$%vel_beg = 0d0
+ bc_${DIM}$%vel_end = 0d0
#:endfor
x_domain%beg = dflt_int; x_domain%end = dflt_int
y_domain%beg = dflt_int; y_domain%end = dflt_int
z_domain%beg = dflt_int; z_domain%end = dflt_int
+ call s_bc_assign_default_values_to_user_inputs(num_bc_patches, patch_bc)
+
! Fluids physical parameters
do i = 1, num_fluids_max
fluid_pp(i)%gamma = dflt_real
@@ -1130,8 +1137,6 @@ contains
num_dims = 1 + min(1, n) + min(1, p)
#:endif
- allocate (proc_coords(1:num_dims))
-
if (parallel_io .neqv. .true.) return
#ifdef MFC_MPI
@@ -1166,7 +1171,6 @@ contains
@:DEALLOCATE(Re_idx)
end if
- deallocate (proc_coords)
if (parallel_io) then
deallocate (start_idx)
do i = 1, sys_size
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 8421fda52e..a8b1043f9f 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -19,6 +19,10 @@ module m_ibm
use m_helper
+ use m_constants
+
+ use m_boundary_conditions_common
+
! ==========================================================================
implicit none
@@ -42,7 +46,6 @@ module m_ibm
type(ghost_point), dimension(:), allocatable :: inner_points
!$acc declare create(ghost_points, inner_points)
- integer :: gp_layers !< Number of ghost point layers
integer :: num_gps !< Number of ghost points
integer :: num_inner_gps !< Number of ghost points
!$acc declare create(gp_layers, num_gps, num_inner_gps)
@@ -52,8 +55,6 @@ contains
!> Allocates memory for the variables in the IBM module
subroutine s_initialize_ibm_module()
- gp_layers = 3
-
if (p > 0) then
@:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, &
-gp_layers:n+gp_layers, -gp_layers:p+gp_layers))
@@ -74,7 +75,7 @@ contains
@:ACC_SETUP_SFs(levelset)
! @:ALLOCATE(ib_markers%sf(0:m, 0:n, 0:p))
- !$acc enter data copyin(gp_layers, num_gps, num_inner_gps)
+ !$acc enter data copyin(num_gps, num_inner_gps)
end subroutine s_initialize_ibm_module
@@ -87,7 +88,7 @@ contains
!$acc update device(ib_markers%sf)
! Get neighboring IB variables from other processors
- call s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
+ call s_mpi_sendrecv_ib_buffers(ib_markers, bc_id_sfs)
!$acc update host(ib_markers%sf)
@@ -212,7 +213,7 @@ contains
! Calculate velocity of ghost cell
if (gp%slip) then
- norm = gp%ip_loc - physical_loc !
+ norm = levelset_norm%sf(gp%loc(1), gp%loc(2), gp%loc(3), gp%ib_patch_id, :)
buf = sqrt(sum(norm**2))
norm = norm/buf
vel_norm_IP = sum(vel_IP*norm)*norm
@@ -447,6 +448,7 @@ contains
integer :: i, j, k, l, q !< Iterator variables
num_gps = 0
+ num_inner_gps = 0
do i = 0, m
do j = 0, n
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 8b1ea7749e..94dbb1364b 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -30,19 +30,12 @@ module m_mpi_proxy
use m_nvtx
use ieee_arithmetic
- ! ==========================================================================
- implicit none
+ use m_constants
- real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_send !<
- !! This variable is utilized to pack and send the buffer of the cell-average
- !! conservative variables, for a single computational domain boundary at the
- !! time, to the relevant neighboring processor.
+ ! ==========================================================================
- real(kind(0d0)), private, allocatable, dimension(:), target :: q_cons_buff_recv !<
- !! q_cons_buff_recv is utilized to receive and unpack the buffer of the cell-
- !! average conservative variables, for a single computational domain boundary
- !! at the time, from the relevant neighboring processor.
+ implicit none
real(kind(0d0)), private, allocatable, dimension(:), target :: c_divs_buff_send !<
!! c_divs_buff_send is utilized to send and unpack the buffer of the cell-
@@ -64,15 +57,13 @@ module m_mpi_proxy
!! immersed boundary markers, for a single computational domain boundary
!! at the time, from the relevant neighboring processor.
- !$acc declare create(q_cons_buff_send, q_cons_buff_recv)
- !$acc declare create( ib_buff_send, ib_buff_recv)
+ !$acc declare create(ib_buff_send, ib_buff_recv)
!$acc declare create(c_divs_buff_send, c_divs_buff_recv)
!> @name Generic flags used to identify and report MPI errors
!> @{
- integer, private :: err_code, ierr, v_size
+ integer, private :: err_code, ierr
!> @}
- !$acc declare create(v_size)
!real :: s_time, e_time
!real :: compress_time, mpi_time, decompress_time
@@ -90,70 +81,45 @@ contains
#ifdef MFC_MPI
- ! Allocating q_cons_buff_send/recv and ib_buff_send/recv. Please note that
- ! for the sake of simplicity, both variables are provided sufficient
- ! storage to hold the largest buffer in the computational domain.
-
- if (qbmm .and. .not. polytropic) then
- if (n > 0) then
- if (p > 0) then
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* &
- & (m + 2*buff_size + 1)* &
- & (n + 2*buff_size + 1)* &
- & (p + 2*buff_size + 1)/ &
- & (min(m, n, p) + 2*buff_size + 1)))
- else
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)* &
- & (max(m, n) + 2*buff_size + 1)))
- end if
- else
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*(sys_size + 2*nb*4)))
- end if
-
- @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1)))
-
- v_size = sys_size + 2*nb*4
- else
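+ ! Note (added for clarity): as with the original allocations removed above, the
+ ! send/recv buffers below are given enough storage to hold the largest
+ ! single-boundary buffer in the computational domain.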
+ if (surface_tension) then
+ nVars = num_dims + 1
if (n > 0) then
if (p > 0) then
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* &
+ @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)* &
& (m + 2*buff_size + 1)* &
& (n + 2*buff_size + 1)* &
& (p + 2*buff_size + 1)/ &
& (min(m, n, p) + 2*buff_size + 1)))
else
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size* &
+ @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)* &
& (max(m, n) + 2*buff_size + 1)))
end if
else
- @:ALLOCATE(q_cons_buff_send(0:-1 + buff_size*sys_size))
+ @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)))
end if
- @:ALLOCATE(q_cons_buff_recv(0:ubound(q_cons_buff_send, 1)))
-
- v_size = sys_size
+ @:ALLOCATE(c_divs_buff_recv(0:ubound(c_divs_buff_send, 1)))
end if
+ !$acc update device(nVars)
- if (surface_tension) then
- nVars = num_dims + 1
+ if (ib) then
if (n > 0) then
if (p > 0) then
- @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)* &
- & (m + 2*buff_size + 1)* &
- & (n + 2*buff_size + 1)* &
- & (p + 2*buff_size + 1)/ &
- & (min(m, n, p) + 2*buff_size + 1)))
+ @:ALLOCATE(ib_buff_send(0:-1 + gp_layers * &
+ & (m + 2*gp_layers + 1)* &
+ & (n + 2*gp_layers + 1)* &
+ & (p + 2*gp_layers + 1)/ &
+ & (min(m, n, p) + 2*gp_layers + 1)))
else
- @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)* &
- & (max(m, n) + 2*buff_size + 1)))
+ @:ALLOCATE(ib_buff_send(0:-1 + gp_layers* &
+ & (max(m, n) + 2*gp_layers + 1)))
end if
else
- @:ALLOCATE(c_divs_buff_send(0:-1 + buff_size*(num_dims+1)))
+ @:ALLOCATE(ib_buff_send(0:-1 + gp_layers))
end if
- @:ALLOCATE(c_divs_buff_recv(0:ubound(c_divs_buff_send, 1)))
+ @:ALLOCATE(ib_buff_recv(0:ubound(ib_buff_send, 1)))
end if
- !$acc update device(v_size, nVars)
#endif
@@ -214,9 +180,6 @@ contains
#:for VAR in [ 'dt','weno_eps','teno_CT','pref','rhoref','R0ref','Web','Ca', 'sigma', &
& 'Re_inv', 'poly_sigma', 'palpha_eps', 'ptgalpha_eps', 'pi_fac', &
- & 'bc_x%vb1','bc_x%vb2','bc_x%vb3','bc_x%ve1','bc_x%ve2','bc_x%ve2', &
- & 'bc_y%vb1','bc_y%vb2','bc_y%vb3','bc_y%ve1','bc_y%ve2','bc_y%ve3', &
- & 'bc_z%vb1','bc_z%vb2','bc_z%vb3','bc_z%ve1','bc_z%ve2','bc_z%ve3', &
& 'bc_x%pres_in','bc_x%pres_out','bc_y%pres_in','bc_y%pres_out', 'bc_z%pres_in','bc_z%pres_out', &
& 'x_domain%beg', 'x_domain%end', 'y_domain%beg', 'y_domain%end', &
& 'z_domain%beg', 'z_domain%end', 't_stop', 't_save', 'cfl_target']
@@ -294,812 +257,146 @@ contains
end subroutine s_mpi_bcast_user_inputs
- !> The purpose of this procedure is to optimally decompose
- !! the computational domain among the available processors.
- !! This is performed by attempting to award each processor,
- !! in each of the coordinate directions, approximately the
- !! same number of cells, and then recomputing the affected
- !! global parameters.
- subroutine s_mpi_decompose_computational_domain
-
-#ifdef MFC_MPI
-
- integer :: num_procs_x, num_procs_y, num_procs_z !<
- !! Optimal number of processors in the x-, y- and z-directions
-
- real(kind(0d0)) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z !<
- !! Non-optimal number of processors in the x-, y- and z-directions
-
- real(kind(0d0)) :: fct_min !<
- !! Processor factorization (fct) minimization parameter
-
- integer :: MPI_COMM_CART !<
- !! Cartesian processor topology communicator
-
- integer :: rem_cells !<
- !! Remaining number of cells, in a particular coordinate direction,
- !! after the majority is divided up among the available processors
-
- integer :: i, j !< Generic loop iterators
-
- if (num_procs == 1 .and. parallel_io) then
- do i = 1, num_dims
- start_idx(i) = 0
- end do
- return
- end if
-
- ! 3D Cartesian Processor Topology ==================================
- if (n > 0) then
-
- if (p > 0) then
-
- if (cyl_coord .and. p > 0) then
- ! Implement pencil processor blocking if using cylindrical coordinates so
- ! that all cells in azimuthal direction are stored on a single processor.
- ! This is necessary for efficient application of Fourier filter near axis.
-
- ! Initial values of the processor factorization optimization
- num_procs_x = 1
- num_procs_y = num_procs
- num_procs_z = 1
- ierr = -1
-
- ! Computing minimization variable for these initial values
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Searching for optimal computational domain distribution
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
-
- end if
-
- end do
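+ !> Exchanges the buffers of the immersed-boundary markers with the
+ !!  neighboring processors in each active coordinate direction.
+ !!  @param ib_markers Immersed-boundary marker field
+ !!  @param bc_id_sfs Boundary-condition id fields for each direction and side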
+ subroutine s_mpi_sendrecv_ib_buffers(ib_markers, bc_id_sfs)
- else
-
- ! Initial estimate of optimal processor topology
- num_procs_x = 1
- num_procs_y = 1
- num_procs_z = num_procs
- ierr = -1
-
- ! Benchmarking the quality of this initial guess
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- tmp_num_procs_z = num_procs_z
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + 10d0*abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
-
- ! Optimization of the initial processor topology
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- do j = 1, num_procs/i
-
- if (mod(num_procs/i, j) == 0 &
- .and. &
- (n + 1)/j >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = j
- tmp_num_procs_z = num_procs/(i*j)
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z) &
- .and. &
- (p + 1)/tmp_num_procs_z &
- >= &
- num_stcls_min*weno_order) &
- then
-
- num_procs_x = i
- num_procs_y = j
- num_procs_z = num_procs/(i*j)
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- + abs((n + 1)/tmp_num_procs_y &
- - (p + 1)/tmp_num_procs_z)
- ierr = 0
+ type(integer_field), intent(inout) :: ib_markers
+ type(t_bc_id_sf), dimension(1:3, -1:1), intent(in) :: bc_id_sfs
- end if
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
- end if
+ integer :: buffer_counts(1:3)
+ integer :: iter_dir, iter_loc
+ integer, pointer :: p_send, p_recv
- end do
+#ifdef MFC_MPI
- end if
+ buffer_counts = (/ &
+ gp_layers*(n + 1)*(p + 1), &
+ gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
+ gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1) &
+ /)
- end do
+ do iter_dir = 1, num_dims
- end if
+ do iter_loc = -1, 1, 2
- ! Verifying that a valid decomposition of the computational
- ! domain has been established. If not, the simulation exits.
- if (proc_rank == 0 .and. ierr == -1) then
- call s_mpi_abort('Unsupported combination of values '// &
- 'of num_procs, m, n, p and '// &
- 'weno_order. Exiting ...')
- end if
+ call nvtxStartRange("RHS-COMM-PACKBUF")
- ! Creating new communicator using the Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, &
- num_procs_y, num_procs_z/), &
- (/.true., .true., .true./), &
- .false., MPI_COMM_CART, ierr)
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="-iter_loc", pack_v_size='1', thickness='gp_layers', inner_loops=[("i", 1, 1)])
+ ib_buff_send(pack_idr) = ib_markers%sf(sx, sy, sz)
+ #:endblock
- ! Finding the Cartesian coordinates of the local process
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, &
- proc_coords, ierr)
- ! END: 3D Cartesian Processor Topology =============================
+ call nvtxEndRange ! Packbuf
- ! Global Parameters for z-direction ================================
+ p_send => ib_buff_send(0)
+ p_recv => ib_buff_recv(0)
+ #:for rdma_mpi in [False, True]
+ if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+ #:if rdma_mpi
+ !$acc data attach(p_send, p_recv)
+ !$acc host_data use_device(p_send, p_recv)
- ! Number of remaining cells
- rem_cells = mod(p + 1, num_procs_z)
+ call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+ #:else
+ call nvtxStartRange("RHS-COMM-DEV2HOST")
+ !$acc update host(ib_buff_send)
+ call nvtxEndRange ! Dev2Host
- ! Optimal number of cells per processor
- p = (p + 1)/num_procs_z - 1
+ call nvtxStartRange("RHS-COMM-SENDRECV-NO-RDMA")
+ #:endif
- ! Distributing the remaining cells
- do i = 1, rem_cells
- if (proc_coords(3) == i - 1) then
- p = p + 1; exit
- end if
- end do
-
- ! Boundary condition at the beginning
- if (proc_coords(3) > 0 .or. bc_z%beg == -1) then
- proc_coords(3) = proc_coords(3) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_z%beg, ierr)
- proc_coords(3) = proc_coords(3) + 1
- end if
+ call MPI_SENDRECV( &
+ p_send, buffer_counts(iter_dir), MPI_INTEGER, neighbor_procs(iter_dir, -iter_loc), 0, &
+ p_recv, buffer_counts(iter_dir), MPI_INTEGER, neighbor_procs(iter_dir, +iter_loc), 0, &
+ MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
- ! Boundary condition at the end
- if (proc_coords(3) < num_procs_z - 1 .or. bc_z%end == -1) then
- proc_coords(3) = proc_coords(3) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_z%end, ierr)
- proc_coords(3) = proc_coords(3) - 1
- end if
+ call nvtxEndRange ! RHS-COMM-SENDRECV-(NO)-RDMA
- if (parallel_io) then
- if (proc_coords(3) < rem_cells) then
- start_idx(3) = (p + 1)*proc_coords(3)
- else
- start_idx(3) = (p + 1)*proc_coords(3) + rem_cells
+ #:if rdma_mpi
+ !$acc end host_data
+ !$acc end data
+ !$acc wait
+ #:else
+ call nvtxStartRange("RHS-COMM-HOST2DEV")
+ !$acc update device(ib_buff_recv)
+ call nvtxEndRange ! Host2Dev
+ #:endif
end if
- end if
- ! ==================================================================
-
- ! 2D Cartesian Processor Topology ==================================
- else
+ #:endfor
- ! Initial estimate of optimal processor topology
- num_procs_x = 1
- num_procs_y = num_procs
- ierr = -1
-
- ! Benchmarking the quality of this initial guess
- tmp_num_procs_x = num_procs_x
- tmp_num_procs_y = num_procs_y
- fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
-
- ! Optimization of the initial processor topology
- do i = 1, num_procs
-
- if (mod(num_procs, i) == 0 &
- .and. &
- (m + 1)/i >= num_stcls_min*weno_order) then
-
- tmp_num_procs_x = i
- tmp_num_procs_y = num_procs/i
-
- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y) &
- .and. &
- (n + 1)/tmp_num_procs_y &
- >= &
- num_stcls_min*weno_order) then
-
- num_procs_x = i
- num_procs_y = num_procs/i
- fct_min = abs((m + 1)/tmp_num_procs_x &
- - (n + 1)/tmp_num_procs_y)
- ierr = 0
-
- end if
+ call nvtxStartRange("RHS-COMM-UNPACKBUF")
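+ ! Unpack the received IB markers into the buffer region, but only
+ ! where the stored boundary-condition type is non-negative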
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="iter_dir", loc="iter_loc", pack_v_size='1', thickness='gp_layers', inner_loops=[("i", 1, 1)])
+ if (bc_id_sfs(iter_dir, iter_loc)%sf(exlhs, eylhs, ezlhs)%type >= 0) then
+ ib_markers%sf(x, y, z) = ib_buff_recv(pack_idr)
end if
+ #:endblock
- end do
-
- ! Verifying that a valid decomposition of the computational
- ! domain has been established. If not, the simulation exits.
- if (proc_rank == 0 .and. ierr == -1) then
- call s_mpi_abort('Unsupported combination of values '// &
- 'of num_procs, m, n and '// &
- 'weno_order. Exiting ...')
- end if
-
- ! Creating new communicator using the Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, &
- num_procs_y/), (/.true., &
- .true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding the Cartesian coordinates of the local process
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, &
- proc_coords, ierr)
-
- end if
- ! END: 2D Cartesian Processor Topology =============================
-
- ! Global Parameters for y-direction ================================
-
- ! Number of remaining cells
- rem_cells = mod(n + 1, num_procs_y)
-
- ! Optimal number of cells per processor
- n = (n + 1)/num_procs_y - 1
+ call nvtxEndRange ! Unpackbuf
- ! Distributing the remaining cells
- do i = 1, rem_cells
- if (proc_coords(2) == i - 1) then
- n = n + 1; exit
- end if
end do
- ! Boundary condition at the beginning
- if (proc_coords(2) > 0 .or. bc_y%beg == -1) then
- proc_coords(2) = proc_coords(2) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_y%beg, ierr)
- proc_coords(2) = proc_coords(2) + 1
- end if
-
- ! Boundary condition at the end
- if (proc_coords(2) < num_procs_y - 1 .or. bc_y%end == -1) then
- proc_coords(2) = proc_coords(2) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
- bc_y%end, ierr)
- proc_coords(2) = proc_coords(2) - 1
- end if
-
- if (parallel_io) then
- if (proc_coords(2) < rem_cells) then
- start_idx(2) = (n + 1)*proc_coords(2)
- else
- start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
- end if
- end if
-
- ! ==================================================================
-
- ! 1D Cartesian Processor Topology ==================================
- else
-
- ! Optimal processor topology
- num_procs_x = num_procs
-
- ! Creating new communicator using the Cartesian topology
- call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), &
- (/.true./), .false., MPI_COMM_CART, &
- ierr)
-
- ! Finding the Cartesian coordinates of the local process
- call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, &
- proc_coords, ierr)
-
- end if
- ! ==================================================================
-
- ! Global Parameters for x-direction ================================
-
- ! Number of remaining cells
- rem_cells = mod(m + 1, num_procs_x)
-
- ! Optimal number of cells per processor
- m = (m + 1)/num_procs_x - 1
-
- ! Distributing the remaining cells
- do i = 1, rem_cells
- if (proc_coords(1) == i - 1) then
- m = m + 1; exit
- end if
end do
- ! Boundary condition at the beginning
- if (proc_coords(1) > 0 .or. bc_x%beg == -1) then
- proc_coords(1) = proc_coords(1) - 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
- proc_coords(1) = proc_coords(1) + 1
- end if
-
- ! Boundary condition at the end
- if (proc_coords(1) < num_procs_x - 1 .or. bc_x%end == -1) then
- proc_coords(1) = proc_coords(1) + 1
- call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
- proc_coords(1) = proc_coords(1) - 1
- end if
-
- if (parallel_io) then
- if (proc_coords(1) < rem_cells) then
- start_idx(1) = (m + 1)*proc_coords(1)
- else
- start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
- end if
- end if
- ! ==================================================================
-
#endif
- end subroutine s_mpi_decompose_computational_domain
-
- !> The goal of this procedure is to populate the buffers of
- !! the grid variables by communicating with the neighboring
- !! processors. Note that only the buffers of the cell-width
- !! distributions are handled in such a way. This is because
- !! the buffers of cell-boundary locations may be calculated
- !! directly from those of the cell-width distributions.
- !! @param mpi_dir MPI communication coordinate direction
- !! @param pbc_loc Processor boundary condition (PBC) location
- subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc)
-
- integer, intent(in) :: mpi_dir
- integer, intent(in) :: pbc_loc
-
- integer :: dst_proc(1:3)
-
-#ifdef MFC_MPI
-
- ! MPI Communication in x-direction =================================
- if (mpi_dir == 1) then
-
- if (pbc_loc == -1) then ! PBC at the beginning
-
- if (bc_x%end >= 0) then ! PBC at the beginning and end
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- dx(m - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- dx(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the beginning only
-
- ! Send/receive buffer to/from bc_x%beg/bc_x%beg
- call MPI_SENDRECV( &
- dx(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- dx(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- else ! PBC at the end
-
- if (bc_x%beg >= 0) then ! PBC at the end and beginning
-
- ! Send/receive buffer to/from bc_x%beg/bc_x%end
- call MPI_SENDRECV( &
- dx(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
- dx(m + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the end only
-
- ! Send/receive buffer to/from bc_x%end/bc_x%end
- call MPI_SENDRECV( &
- dx(m - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 0, &
- dx(m + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- end if
- ! END: MPI Communication in x-direction ============================
-
- ! MPI Communication in y-direction =================================
- elseif (mpi_dir == 2) then
-
- if (pbc_loc == -1) then ! PBC at the beginning
-
- if (bc_y%end >= 0) then ! PBC at the beginning and end
-
- ! Send/receive buffer to/from bc_y%end/bc_y%beg
- call MPI_SENDRECV( &
- dy(n - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 0, &
- dy(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the beginning only
-
- ! Send/receive buffer to/from bc_y%beg/bc_y%beg
- call MPI_SENDRECV( &
- dy(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
- dy(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- else ! PBC at the end
-
- if (bc_y%beg >= 0) then ! PBC at the end and beginning
-
- ! Send/receive buffer to/from bc_y%beg/bc_y%end
- call MPI_SENDRECV( &
- dy(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
- dy(n + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the end only
-
- ! Send/receive buffer to/from bc_y%end/bc_y%end
- call MPI_SENDRECV( &
- dy(n - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 0, &
- dy(n + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- end if
- ! END: MPI Communication in y-direction ============================
-
- ! MPI Communication in z-direction =================================
- else
-
- if (pbc_loc == -1) then ! PBC at the beginning
-
- if (bc_z%end >= 0) then ! PBC at the beginning and end
-
- ! Send/receive buffer to/from bc_z%end/bc_z%beg
- call MPI_SENDRECV( &
- dz(p - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- dz(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the beginning only
-
- ! Send/receive buffer to/from bc_z%beg/bc_z%beg
- call MPI_SENDRECV( &
- dz(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- dz(-buff_size), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- else ! PBC at the end
-
- if (bc_z%beg >= 0) then ! PBC at the end and beginning
-
- ! Send/receive buffer to/from bc_z%beg/bc_z%end
- call MPI_SENDRECV( &
- dz(0), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
- dz(p + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- else ! PBC at the end only
-
- ! Send/receive buffer to/from bc_z%end/bc_z%end
- call MPI_SENDRECV( &
- dz(p - buff_size + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 0, &
- dz(p + 1), buff_size, &
- MPI_DOUBLE_PRECISION, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- end if
-
- end if
+ end subroutine s_mpi_sendrecv_ib_buffers
- end if
- ! END: MPI Communication in z-direction ============================
+ subroutine s_mpi_sendrecv_capilary_variables_buffers(c_divs_vf, bc_id_sfs, mpi_dir, pbc_loc)
-#endif
+ type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs_vf
+ type(t_bc_id_sf), dimension(1:3, -1:1), intent(in) :: bc_id_sfs
- end subroutine s_mpi_sendrecv_grid_variables_buffers
-
- !> The goal of this procedure is to populate the buffers of
- !! the cell-average conservative variables by communicating
- !! with the neighboring processors.
- !! @param q_cons_vf Cell-average conservative variables
- !! @param mpi_dir MPI communication coordinate direction
- !! @param pbc_loc Processor boundary condition (PBC) location
- subroutine s_mpi_sendrecv_variables_buffers(q_cons_vf, &
- pb, mv, &
- mpi_dir, &
- pbc_loc)
-
- type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
- real(kind(0d0)), dimension(startx:, starty:, startz:, 1:, 1:), intent(inout) :: pb, mv
integer, intent(in) :: mpi_dir, pbc_loc
- integer :: i, j, k, l, r, q !< Generic loop iterators
-
integer :: buffer_counts(1:3), buffer_count
type(int_bounds_info) :: boundary_conditions(1:3)
integer :: beg_end(1:2), grid_dims(1:3)
integer :: dst_proc, src_proc, recv_tag, send_tag
- logical :: beg_end_geq_0
-
integer :: pack_offset, unpack_offset
real(kind(0d0)), pointer :: p_send, p_recv
-#ifdef MFC_MPI
-
- call nvtxStartRange("RHS-COMM-PACKBUF")
- !$acc update device(v_size)
-
- if (qbmm .and. .not. polytropic) then
- buffer_counts = (/ &
- buff_size*(sys_size + 2*nb*4)*(n + 1)*(p + 1), &
- buff_size*(sys_size + 2*nb*4)*(m + 2*buff_size + 1)*(p + 1), &
- buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
- /)
- else
- buffer_counts = (/ &
- buff_size*sys_size*(n + 1)*(p + 1), &
- buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
- buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
- /)
- end if
-
- buffer_count = buffer_counts(mpi_dir)
- boundary_conditions = (/bc_x, bc_y, bc_z/)
- beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/)
- beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0
-
- ! Implements:
- ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc
- ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg]
- ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg]
- ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end]
- ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end]
-
- send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1))
- recv_tag = f_logical_to_int(pbc_loc == 1)
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
- dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0)))
- src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1))
-
- grid_dims = (/m, n, p/)
+#ifdef MFC_MPI
- pack_offset = 0
- if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
- pack_offset = grid_dims(mpi_dir) - buff_size + 1
- end if
+ nVars = num_dims + 1
+ !$acc update device(nVars)
- unpack_offset = 0
- if (pbc_loc == 1) then
- unpack_offset = grid_dims(mpi_dir) + buff_size + 1
- end if
+ buffer_counts = (/ &
+ buff_size*nVars*(n + 1)*(p + 1), &
+ buff_size*nVars*(m + 2*buff_size + 1)*(p + 1), &
+ buff_size*nVars*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
+ /)
- ! Pack Buffer to Send
- #:for mpi_dir in [1, 2, 3]
- if (mpi_dir == ${mpi_dir}$) then
- #:if mpi_dir == 1
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = 1, sys_size
- r = (i - 1) + v_size*(j + buff_size*(k + (n + 1)*l))
- q_cons_buff_send(r) = q_cons_vf(i)%sf(j + pack_offset, k, l)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = sys_size + 1, sys_size + 4
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- (j + buff_size*(k + (n + 1)*l))
- q_cons_buff_send(r) = pb(j + pack_offset, k, l, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = sys_size + 1, sys_size + 4
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- (j + buff_size*(k + (n + 1)*l))
- q_cons_buff_send(r) = mv(j + pack_offset, k, l, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
- end if
- #:elif mpi_dir == 2
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, sys_size
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- r = (i - 1) + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- (k + buff_size*l))
- q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k + pack_offset, l)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- (k + buff_size*l))
- q_cons_buff_send(r) = pb(j, k + pack_offset, l, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- (k + buff_size*l))
- q_cons_buff_send(r) = mv(j, k + pack_offset, l, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
- end if
- #:else
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, sys_size
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- r = (i - 1) + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)*l))
- q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l + pack_offset)
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)*l))
- q_cons_buff_send(r) = pb(j, k, l + pack_offset, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)*l))
- q_cons_buff_send(r) = mv(j, k, l + pack_offset, i - sys_size, q)
- end do
- end do
- end do
- end do
- end do
- end if
- #:endif
- end if
- #:endfor
- call nvtxEndRange ! Packbuf
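+ ! Pack the boundary region of the capillary variables into c_divs_buff_send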
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="-pbc_loc", inner_loops=[("i", 1, "nVars")], pack_v_size="nVars")
+ c_divs_buff_send(pack_idr) = c_divs_vf(i)%sf(sx, sy, sz)
+ #:endblock
- ! Send/Recv
+ p_send => c_divs_buff_send(0)
+ p_recv => c_divs_buff_recv(0)
#:for rdma_mpi in [False, True]
if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
- p_send => q_cons_buff_send(0)
- p_recv => q_cons_buff_recv(0)
#:if rdma_mpi
!$acc data attach(p_send, p_recv)
!$acc host_data use_device(p_send, p_recv)
+
call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
#:else
call nvtxStartRange("RHS-COMM-DEV2HOST")
- !$acc update host(q_cons_buff_send, ib_buff_send)
+ !$acc update host(c_divs_buff_send)
call nvtxEndRange
+
call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
#:endif
call MPI_SENDRECV( &
- p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, &
- p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, &
+ p_send, buffer_counts(mpi_dir), MPI_DOUBLE_PRECISION, neighbor_procs(mpi_dir, -pbc_loc), 0, &
+ p_recv, buffer_counts(mpi_dir), MPI_DOUBLE_PRECISION, neighbor_procs(mpi_dir, +pbc_loc), 0, &
MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
#:if rdma_mpi
@@ -1108,1243 +405,17 @@ contains
!$acc wait
#:else
call nvtxStartRange("RHS-COMM-HOST2DEV")
- !$acc update device(q_cons_buff_recv)
+ !$acc update device(c_divs_buff_recv)
call nvtxEndRange
#:endif
end if
#:endfor
- ! Unpack Received Buffer
- call nvtxStartRange("RHS-COMM-UNPACKBUF")
- #:for mpi_dir in [1, 2, 3]
- if (mpi_dir == ${mpi_dir}$) then
- #:if mpi_dir == 1
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = -buff_size, -1
- do i = 1, sys_size
- r = (i - 1) + v_size* &
- (j + buff_size*((k + 1) + (n + 1)*l))
- q_cons_vf(i)%sf(j + unpack_offset, k, l) = q_cons_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = -buff_size, -1
- do i = sys_size + 1, sys_size + 4
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- (j + buff_size*((k + 1) + (n + 1)*l))
- pb(j + unpack_offset, k, l, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = -buff_size, -1
- do i = sys_size + 1, sys_size + 4
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- (j + buff_size*((k + 1) + (n + 1)*l))
- mv(j + unpack_offset, k, l, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
- end if
- #:elif mpi_dir == 2
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, sys_size
- do l = 0, p
- do k = -buff_size, -1
- do j = -buff_size, m + buff_size
- r = (i - 1) + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + buff_size*l))
- q_cons_vf(i)%sf(j, k + unpack_offset, l) = q_cons_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, p
- do k = -buff_size, -1
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + buff_size*l))
- pb(j, k + unpack_offset, l, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = 0, p
- do k = -buff_size, -1
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + buff_size*l))
- mv(j, k + unpack_offset, l, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
- end if
- #:else
- ! Unpacking buffer from bc_z%beg
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, sys_size
- do l = -buff_size, -1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- r = (i - 1) + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)* &
- (l + buff_size)))
- q_cons_vf(i)%sf(j, k, l + unpack_offset) = q_cons_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- if (qbmm .and. .not. polytropic) then
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = -buff_size, -1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)* &
- (l + buff_size)))
- pb(j, k, l + unpack_offset, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
-
- !$acc parallel loop collapse(5) gang vector default(present) private(r)
- do i = sys_size + 1, sys_size + 4
- do l = -buff_size, -1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- do q = 1, nb
- r = (i - 1) + (q - 1)*4 + nb*4 + v_size* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)* &
- (l + buff_size)))
- mv(j, k, l + unpack_offset, i - sys_size, q) = q_cons_buff_recv(r)
- end do
- end do
- end do
- end do
- end do
- end if
- #:endif
- end if
- #:endfor
- call nvtxEndRange
-
-#endif
-
- end subroutine s_mpi_sendrecv_variables_buffers
-
- !> The goal of this procedure is to populate the buffers of
- !! the cell-average conservative variables by communicating
- !! with the neighboring processors.
- subroutine s_mpi_sendrecv_ib_buffers(ib_markers, gp_layers)
-
- type(integer_field), intent(inout) :: ib_markers
- integer, intent(in) :: gp_layers
-
- integer :: i, j, k, l, r !< Generic loop iterators
- integer, pointer, dimension(:) :: p_i_send, p_i_recv
-
-#ifdef MFC_MPI
-
- if (n > 0) then
- if (p > 0) then
- @:ALLOCATE(ib_buff_send(0:-1 + gp_layers * &
- & (m + 2*gp_layers + 1)* &
- & (n + 2*gp_layers + 1)* &
- & (p + 2*gp_layers + 1)/ &
- & (min(m, n, p) + 2*gp_layers + 1)))
- else
- @:ALLOCATE(ib_buff_send(0:-1 + gp_layers* &
- & (max(m, n) + 2*gp_layers + 1)))
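+ ! Unpack the received capillary variables into the buffer region, but only
+ ! where the stored boundary-condition type is non-negative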
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir="mpi_dir", loc="pbc_loc", inner_loops=[("i", 1, "nVars")], pack_v_size="nVars")
+ if (bc_id_sfs(mpi_dir, pbc_loc)%sf(exlhs, eylhs, ezlhs)%type >= 0) then
+ c_divs_vf(i)%sf(x, y, z) = c_divs_buff_recv(pack_idr)
end if
- else
- @:ALLOCATE(ib_buff_send(0:-1 + gp_layers))
- end if
- @:ALLOCATE(ib_buff_recv(0:ubound(ib_buff_send, 1)))
-
- !nCalls_time = nCalls_time + 1
-
- ! MPI Communication in x-direction =================================
- if (bc_x%beg >= 0) then ! PBC at the beginning
-
- if (bc_x%end >= 0) then ! PBC at the beginning and end
-
- ! Packing buffer to be sent to bc_x%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = m - gp_layers + 1, m
- r = ((j - m - 1) + gp_layers*((k + 1) + (n + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 0, &
- p_i_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send, ib_buff_send)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the beginning only
-
- ! Packing buffer to be sent to bc_x%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = 0, gp_layers - 1
- r = (j + gp_layers*(k + (n + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 1, &
- p_i_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
- !$acc update host(ib_buff_send)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-#endif
-
- ! Unpacking buffer received from bc_x%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = -gp_layers, -1
- r = (j + gp_layers*((k + 1) + (n + 1)*l))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
-
- if (bc_x%end >= 0) then ! PBC at the end
-
- if (bc_x%beg >= 0) then ! PBC at the end and beginning
-
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- ! Packing buffer to be sent to bc_x%beg
- do l = 0, p
- do k = 0, n
- do j = 0, gp_layers - 1
- r = (j + gp_layers*(k + (n + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 1, &
- p_i_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the end only
-
- ! Packing buffer to be sent to bc_x%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = m - gp_layers + 1, m
- r = ((j - m - 1) + gp_layers*((k + 1) + (n + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
- ! Send/receive buffer to/from bc_x%end/bc_x%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 0, &
- p_i_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(n + 1)*(p + 1), &
- MPI_INTEGER, bc_x%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-
- ! Unpacking buffer received from bc_x%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = m + 1, m + gp_layers
- r = ((j - m - 1) + gp_layers*(k + (n + 1)*l))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
- ! END: MPI Communication in x-direction ============================
-
- ! MPI Communication in y-direction =================================
-
- if (bc_y%beg >= 0) then ! PBC at the beginning
-
- if (bc_y%end >= 0) then ! PBC at the beginning and end
-
- ! Packing buffer to be sent to bc_y%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = n - gp_layers + 1, n
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k - n + gp_layers - 1) + gp_layers*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_y%end/bc_y%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 0, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_y%end/bc_y%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the beginning only
-
- ! Packing buffer to be sent to bc_y%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, gp_layers - 1
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- (k + gp_layers*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_y%beg/bc_y%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 1, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_y%beg/bc_y%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-#endif
-
- ! Unpacking buffer received from bc_y%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = -gp_layers, -1
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + gp_layers*l))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
-
- if (bc_y%end >= 0) then ! PBC at the end
-
- if (bc_y%beg >= 0) then ! PBC at the end and beginning
-
- ! Packing buffer to be sent to bc_y%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, gp_layers - 1
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- (k + gp_layers*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_y%beg/bc_y%end
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 1, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_y%beg/bc_y%end
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the end only
-
- ! Packing buffer to be sent to bc_y%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = n - gp_layers + 1, n
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k - n + gp_layers - 1) + gp_layers*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_y%end/bc_y%end
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 0, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_y%end/bc_y%end
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(p + 1), &
- MPI_INTEGER, bc_y%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-#endif
-
-                ! Unpacking buffer received from bc_y%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, p
- do k = n + 1, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k - n - 1) + gp_layers*l))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
- ! END: MPI Communication in y-direction ============================
-
- ! MPI Communication in z-direction =================================
- if (bc_z%beg >= 0) then ! PBC at the beginning
-
- if (bc_z%end >= 0) then ! PBC at the beginning and end
-
- ! Packing buffer to be sent to bc_z%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = p - gp_layers + 1, p
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)* &
- (l - p + gp_layers - 1)))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_z%end/bc_z%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 0, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_z%end/bc_z%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the beginning only
-
- ! Packing buffer to be sent to bc_z%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, gp_layers - 1
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_z%beg/bc_z%beg
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 1, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
-
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_z%beg/bc_z%beg
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 0, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-#endif
-
- ! Unpacking buffer from bc_z%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = -gp_layers, -1
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)* &
- (l + gp_layers)))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
-
- if (bc_z%end >= 0) then ! PBC at the end
-
- if (bc_z%beg >= 0) then ! PBC at the end and beginning
-
- ! Packing buffer to be sent to bc_z%beg
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = 0, gp_layers - 1
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)*l))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_z%beg/bc_z%end
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 1, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_z%beg/bc_z%end
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%beg, 1, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- else ! PBC at the end only
-
- ! Packing buffer to be sent to bc_z%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = p - gp_layers + 1, p
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)* &
- (l - p + gp_layers - 1)))
- ib_buff_send(r) = ib_markers%sf(j, k, l)
- end do
- end do
- end do
-
- !call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi) then
- p_i_send => ib_buff_send
- p_i_recv => ib_buff_recv
-
- !$acc data attach(p_i_send, p_i_recv)
- !$acc host_data use_device(p_i_send, p_i_recv)
-
-                        ! Send/receive buffer to/from bc_z%end/bc_z%end
- call MPI_SENDRECV( &
- p_i_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 0, &
- p_i_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- !$acc end host_data
- !$acc end data
- !$acc wait
- else
-#endif
- !$acc update host(ib_buff_send)
-
-                    ! Send/receive buffer to/from bc_z%end/bc_z%end
- call MPI_SENDRECV( &
- ib_buff_send(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 0, &
- ib_buff_recv(0), &
- gp_layers*(m + 2*gp_layers + 1)*(n + 2*gp_layers + 1), &
- MPI_INTEGER, bc_z%end, 1, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
-#if defined(MFC_OpenACC)
- end if
-#endif
-
- end if
-
-#if defined(MFC_OpenACC)
- if (rdma_mpi .eqv. .false.) then
- !$acc update device(ib_buff_recv)
- end if
-#endif
-
- ! Unpacking buffer received from bc_z%end
- !$acc parallel loop collapse(3) gang vector default(present) private(r)
- do l = p + 1, p + gp_layers
- do k = -gp_layers, n + gp_layers
- do j = -gp_layers, m + gp_layers
- r = ((j + gp_layers) + (m + 2*gp_layers + 1)* &
- ((k + gp_layers) + (n + 2*gp_layers + 1)* &
- (l - p - 1)))
- ib_markers%sf(j, k, l) = ib_buff_recv(r)
- end do
- end do
- end do
-
- end if
-
- ! END: MPI Communication in z-direction ============================
-
-#endif
-
- end subroutine s_mpi_sendrecv_ib_buffers
-
- subroutine s_mpi_sendrecv_capilary_variables_buffers(c_divs_vf, mpi_dir, pbc_loc)
-
- type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs_vf
- integer, intent(in) :: mpi_dir, pbc_loc
-
- integer :: i, j, k, l, r, q !< Generic loop iterators
-
- integer :: buffer_counts(1:3), buffer_count
-
- type(int_bounds_info) :: boundary_conditions(1:3)
- integer :: beg_end(1:2), grid_dims(1:3)
- integer :: dst_proc, src_proc, recv_tag, send_tag
-
- logical :: beg_end_geq_0
-
- integer :: pack_offset, unpack_offset
- real(kind(0d0)), pointer :: p_send, p_recv
-
-#ifdef MFC_MPI
-
- nVars = num_dims + 1
- !$acc update device(nVars)
-
- buffer_counts = (/ &
- buff_size*nVars*(n + 1)*(p + 1), &
- buff_size*nVars*(m + 2*buff_size + 1)*(p + 1), &
- buff_size*nVars*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
- /)
-
- buffer_count = buffer_counts(mpi_dir)
- boundary_conditions = (/bc_x, bc_y, bc_z/)
- beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/)
- beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0
-
- ! Implements:
- ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc
- ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg]
- ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg]
- ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end]
- ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end]
-
- send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1))
- recv_tag = f_logical_to_int(pbc_loc == 1)
-
- dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0)))
- src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1))
-
- grid_dims = (/m, n, p/)
-
- pack_offset = 0
- if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
- pack_offset = grid_dims(mpi_dir) - buff_size + 1
- end if
-
- unpack_offset = 0
- if (pbc_loc == 1) then
- unpack_offset = grid_dims(mpi_dir) + buff_size + 1
- end if
-
- ! Pack Buffer to Send
- #:for mpi_dir in [1, 2, 3]
- if (mpi_dir == ${mpi_dir}$) then
- #:if mpi_dir == 1
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = 0, buff_size - 1
- do i = 1, nVars
- r = (i - 1) + nVars*(j + buff_size*(k + (n + 1)*l))
- c_divs_buff_send(r) = c_divs_vf(i)%sf(j + pack_offset, k, l)
- end do
- end do
- end do
- end do
-
- #:elif mpi_dir == 2
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, nVars
- do l = 0, p
- do k = 0, buff_size - 1
- do j = -buff_size, m + buff_size
- r = (i - 1) + nVars* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- (k + buff_size*l))
- c_divs_buff_send(r) = c_divs_vf(i)%sf(j, k + pack_offset, l)
- end do
- end do
- end do
- end do
-
- #:else
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, nVars
- do l = 0, buff_size - 1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- r = (i - 1) + nVars* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)*l))
- c_divs_buff_send(r) = c_divs_vf(i)%sf(j, k, l + pack_offset)
- end do
- end do
- end do
- end do
- #:endif
- end if
- #:endfor
-
- ! Send/Recv
- #:for rdma_mpi in [False, True]
- if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
- p_send => c_divs_buff_send(0)
- p_recv => c_divs_buff_recv(0)
-
- #:if rdma_mpi
- !$acc data attach(p_send, p_recv)
- !$acc host_data use_device(p_send, p_recv)
- #:else
- !$acc update host(c_divs_buff_send)
- #:endif
-
- call MPI_SENDRECV( &
- p_send, buffer_count, MPI_DOUBLE_PRECISION, dst_proc, send_tag, &
- p_recv, buffer_count, MPI_DOUBLE_PRECISION, src_proc, recv_tag, &
- MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
-
- #:if rdma_mpi
- !$acc end host_data
- !$acc end data
- !$acc wait
- #:else
- !$acc update device(c_divs_buff_recv)
- #:endif
- end if
- #:endfor
-
- ! Unpack Received Buffer
- #:for mpi_dir in [1, 2, 3]
- if (mpi_dir == ${mpi_dir}$) then
- #:if mpi_dir == 1
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do l = 0, p
- do k = 0, n
- do j = -buff_size, -1
- do i = 1, nVars
- r = (i - 1) + nVars* &
- (j + buff_size*((k + 1) + (n + 1)*l))
- c_divs_vf(i)%sf(j + unpack_offset, k, l) = c_divs_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(c_divs_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- #:elif mpi_dir == 2
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, nVars
- do l = 0, p
- do k = -buff_size, -1
- do j = -buff_size, m + buff_size
- r = (i - 1) + nVars* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + buff_size*l))
- c_divs_vf(i)%sf(j, k + unpack_offset, l) = c_divs_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(c_divs_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- #:else
-                    ! Unpacking buffer received in the z-direction
- !$acc parallel loop collapse(4) gang vector default(present) private(r)
- do i = 1, nVars
- do l = -buff_size, -1
- do k = -buff_size, n + buff_size
- do j = -buff_size, m + buff_size
- r = (i - 1) + nVars* &
- ((j + buff_size) + (m + 2*buff_size + 1)* &
- ((k + buff_size) + (n + 2*buff_size + 1)* &
- (l + buff_size)))
- c_divs_vf(i)%sf(j, k, l + unpack_offset) = c_divs_buff_recv(r)
-#if defined(__INTEL_COMPILER)
- if (ieee_is_nan(c_divs_vf(i)%sf(j, k, l))) then
- print *, "Error", j, k, l, i
- error stop "NaN(s) in recv"
- end if
-#endif
- end do
- end do
- end do
- end do
-
- #:endif
- end if
- #:endfor
+ #:endblock
#endif
@@ -2363,8 +434,6 @@ contains
#ifdef MFC_MPI
- ! Deallocating q_cons_buff_send and q_cons_buff_recv
- @:DEALLOCATE(q_cons_buff_send, q_cons_buff_recv)
if (ib) then
@:DEALLOCATE(ib_buff_send, ib_buff_recv)
end if
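The tag and neighbor selection spelled out in the `Implements:` comment table of the removed `s_mpi_sendrecv_capilary_variables_buffers` can be restated compactly. The Python sketch below is illustrative only: `select_comm` is not part of the codebase, Python booleans stand in for the Fortran logicals, and the 1-based `beg_end` array becomes a 0-based tuple.

```python
def select_comm(pbc_loc, beg, end):
    # beg/end are the neighbor ranks from bc_<dir>%beg and bc_<dir>%end
    # (negative when that side is not a processor boundary).
    beg_end = (beg, end)
    # For pbc_loc == -1 this inspects 'end'; for pbc_loc == +1 it inspects 'beg',
    # i.e. whether the opposite side of this rank is also a processor boundary.
    beg_end_geq_0 = beg_end[max(pbc_loc, 0) - pbc_loc] >= 0

    send_tag = int(not ((pbc_loc == 1) ^ beg_end_geq_0))
    recv_tag = int(pbc_loc == 1)
    dst_proc = beg_end[int((pbc_loc == 1) ^ beg_end_geq_0)]
    src_proc = beg_end[int(pbc_loc == 1)]
    return send_tag, recv_tag, dst_proc, src_proc

# With hypothetical neighbor ranks 3 and 4, this reproduces the comment table:
print(select_comm(-1, beg=3, end=4))  # (0, 0, 4, 3): tags [0, 0], procs [end, beg]
print(select_comm(+1, beg=3, end=4))  # (1, 1, 3, 4): tags [1, 1], procs [beg, end]
```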
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 389847dd75..1d6c20f3fb 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -670,7 +670,7 @@ contains
call nvtxEndRange
call nvtxStartRange("RHS-COMMUNICATION")
- call s_populate_variables_buffers(q_prim_qp%vf, pb, mv)
+ call s_populate_prim_buffers(q_prim_qp%vf, pb, mv)
call nvtxEndRange
if (cfl_dt) then
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 4ecb6127c0..cabc2f5de4 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -61,6 +61,8 @@ module m_start_up
use m_helper_basic !< Functions to compare floating point numbers
+ use m_boundary_conditions_common
+
#ifdef MFC_OpenACC
use openacc
#endif
@@ -78,6 +80,8 @@ module m_start_up
use m_surface_tension
use m_body_forces
+
+ use m_boundary_conditions
! ==========================================================================
implicit none
@@ -87,7 +91,7 @@ module m_start_up
s_read_data_files, &
s_read_serial_data_files, &
s_read_parallel_data_files, &
- s_populate_grid_variables_buffers, &
+ s_mpi_sendrecv_grid_spacing_buffers, &
s_initialize_internal_energy_equations, &
s_initialize_modules, s_initialize_gpu_vars, &
s_initialize_mpi_domain, s_finalize_modules, &
@@ -141,6 +145,7 @@ contains
teno_CT, mp_weno, weno_avg, &
riemann_solver, low_Mach, wave_speeds, avg_state, &
bc_x, bc_y, bc_z, &
+ num_bc_patches, patch_bc, &
x_domain, y_domain, z_domain, &
hypoelasticity, &
ib, num_ibs, patch_ib, &
@@ -252,6 +257,8 @@ contains
integer :: i, r !< Generic loop iterator
+ integer :: iter_dir, iter_loc
+
! Confirming that the directory from which the initial condition and
! the grid data files are to be read in exists and exiting otherwise
if (cfl_dt) then
@@ -269,6 +276,8 @@ contains
call s_mpi_abort(trim(file_path)//' is missing. Exiting ...')
end if
+ call s_read_boundary_condition_files(t_step_dir)
+
! Cell-boundary Locations in x-direction ===========================
file_path = trim(t_step_dir)//'/x_cb.dat'
@@ -502,6 +511,7 @@ contains
logical :: file_exist
character(len=10) :: t_step_start_string
+ character(len=path_len) :: step_dirpath
integer :: i, j
@@ -509,8 +519,10 @@ contains
allocate (y_cb_glb(-1:n_glb))
allocate (z_cb_glb(-1:p_glb))
+ step_dirpath = trim(case_dir)//'/restart_data'
+
! Read in cell boundary locations in x-direction
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//'x_cb.dat'
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//'x_cb.dat'
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -541,7 +553,7 @@ contains
if (n > 0) then
! Read in cell boundary locations in y-direction
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//'y_cb.dat'
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//'y_cb.dat'
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -584,6 +596,8 @@ contains
end if
end if
+ call s_read_boundary_condition_files(trim(step_dirpath))
+
if (file_per_process) then
if (cfl_dt) then
call s_int_to_str(n_start, t_step_start_string)
@@ -592,7 +606,7 @@ contains
call s_int_to_str(t_step_start, t_step_start_string)
write (file_loc, '(I0,A1,I7.7,A)') t_step_start, '_', proc_rank, '.dat'
end if
- file_loc = trim(case_dir)//'/restart_data/lustre_'//trim(t_step_start_string)//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//'/lustre_'//trim(t_step_start_string)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -653,7 +667,7 @@ contains
if (ib) then
! Read IB Markers
write (file_loc, '(A)') 'ib.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -673,7 +687,7 @@ contains
! Read Levelset
write (file_loc, '(A)') 'levelset.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -693,7 +707,7 @@ contains
! Read Levelset Norm
write (file_loc, '(A)') 'levelset_norm.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -724,7 +738,7 @@ contains
else
write (file_loc, '(I0,A)') t_step_start, '.dat'
end if
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -803,7 +817,7 @@ contains
! Read IB Markers
write (file_loc, '(A)') 'ib.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -823,7 +837,7 @@ contains
! Read Levelset
write (file_loc, '(A)') 'levelset.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -834,7 +848,7 @@ contains
call MPI_FILE_SET_VIEW(ifile, disp, MPI_DOUBLE_PRECISION, MPI_IO_levelset_DATA%view, &
'native', mpi_info_int, ierr)
- call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size, &
+ call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, &
MPI_DOUBLE_PRECISION, status, ierr)
else
@@ -843,7 +857,7 @@ contains
! Read Levelset Norm
write (file_loc, '(A)') 'levelset_norm.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -880,7 +894,7 @@ contains
allocate (airfoil_grid_l(1:Np))
write (file_loc, '(A)') 'airfoil_l.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -897,7 +911,7 @@ contains
end if
write (file_loc, '(A)') 'airfoil_u.dat'
- file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+ file_loc = trim(step_dirpath)//trim(mpiiofs)//trim(file_loc)
inquire (FILE=trim(file_loc), EXIST=file_exist)
if (file_exist) then
@@ -932,218 +946,6 @@ contains
end subroutine s_read_parallel_data_files
- !> The purpose of this subroutine is to populate the buffers
- !! of the grid variables, which are constituted of the cell-
- !! boundary locations and cell-width distributions, based on
- !! the boundary conditions.
- subroutine s_populate_grid_variables_buffers
-
- integer :: i !< Generic loop iterator
-
- ! Population of Buffers in x-direction =============================
-
- ! Populating cell-width distribution buffer, at the beginning of the
- ! coordinate direction, based on the selected boundary condition. In
- ! order, these are the ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (bc_x%beg <= -3) then
- do i = 1, buff_size
- dx(-i) = dx(0)
- end do
- elseif (bc_x%beg == -2) then
- do i = 1, buff_size
- dx(-i) = dx(i - 1)
- end do
- elseif (bc_x%beg == -1) then
- do i = 1, buff_size
- dx(-i) = dx(m - (i - 1))
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(1, -1)
- end if
-
- ! Computing the cell-boundary locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- x_cb(-1 - i) = x_cb(-i) - dx(-i)
- end do
- ! Computing the cell-center locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- x_cc(-i) = x_cc(1 - i) - (dx(1 - i) + dx(-i))/2d0
- end do
-
- ! Populating the cell-width distribution buffer, at the end of the
- ! coordinate direction, based on desired boundary condition. These
- ! include, in order, ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (bc_x%end <= -3) then
- do i = 1, buff_size
- dx(m + i) = dx(m)
- end do
- elseif (bc_x%end == -2) then
- do i = 1, buff_size
- dx(m + i) = dx(m - (i - 1))
- end do
- elseif (bc_x%end == -1) then
- do i = 1, buff_size
- dx(m + i) = dx(i - 1)
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(1, 1)
- end if
-
- ! Populating the cell-boundary locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- x_cb(m + i) = x_cb(m + (i - 1)) + dx(m + i)
- end do
- ! Populating the cell-center locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- x_cc(m + i) = x_cc(m + (i - 1)) + (dx(m + (i - 1)) + dx(m + i))/2d0
- end do
-
- ! END: Population of Buffers in x-direction ========================
-
- ! Population of Buffers in y-direction =============================
-
- ! Populating cell-width distribution buffer, at the beginning of the
- ! coordinate direction, based on the selected boundary condition. In
- ! order, these are the ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (n == 0) then
- return
- elseif (bc_y%beg <= -3 .and. bc_y%beg /= -14) then
- do i = 1, buff_size
- dy(-i) = dy(0)
- end do
- elseif (bc_y%beg == -2 .or. bc_y%beg == -14) then
- do i = 1, buff_size
- dy(-i) = dy(i - 1)
- end do
- elseif (bc_y%beg == -1) then
- do i = 1, buff_size
- dy(-i) = dy(n - (i - 1))
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(2, -1)
- end if
-
- ! Computing the cell-boundary locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- y_cb(-1 - i) = y_cb(-i) - dy(-i)
- end do
- ! Computing the cell-center locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- y_cc(-i) = y_cc(1 - i) - (dy(1 - i) + dy(-i))/2d0
- end do
-
- ! Populating the cell-width distribution buffer, at the end of the
- ! coordinate direction, based on desired boundary condition. These
- ! include, in order, ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (bc_y%end <= -3) then
- do i = 1, buff_size
- dy(n + i) = dy(n)
- end do
- elseif (bc_y%end == -2) then
- do i = 1, buff_size
- dy(n + i) = dy(n - (i - 1))
- end do
- elseif (bc_y%end == -1) then
- do i = 1, buff_size
- dy(n + i) = dy(i - 1)
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(2, 1)
- end if
-
- ! Populating the cell-boundary locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- y_cb(n + i) = y_cb(n + (i - 1)) + dy(n + i)
- end do
- ! Populating the cell-center locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- y_cc(n + i) = y_cc(n + (i - 1)) + (dy(n + (i - 1)) + dy(n + i))/2d0
- end do
-
- ! END: Population of Buffers in y-direction ========================
-
- ! Population of Buffers in z-direction =============================
-
- ! Populating cell-width distribution buffer, at the beginning of the
- ! coordinate direction, based on the selected boundary condition. In
- ! order, these are the ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (p == 0) then
- return
- elseif (bc_z%beg <= -3) then
- do i = 1, buff_size
- dz(-i) = dz(0)
- end do
- elseif (bc_z%beg == -2) then
- do i = 1, buff_size
- dz(-i) = dz(i - 1)
- end do
- elseif (bc_z%beg == -1) then
- do i = 1, buff_size
- dz(-i) = dz(p - (i - 1))
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(3, -1)
- end if
-
- ! Computing the cell-boundary locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- z_cb(-1 - i) = z_cb(-i) - dz(-i)
- end do
- ! Computing the cell-center locations buffer, at the beginning of
- ! the coordinate direction, from the cell-width distribution buffer
- do i = 1, buff_size
- z_cc(-i) = z_cc(1 - i) - (dz(1 - i) + dz(-i))/2d0
- end do
-
- ! Populating the cell-width distribution buffer, at the end of the
- ! coordinate direction, based on desired boundary condition. These
- ! include, in order, ghost-cell extrapolation, symmetry, periodic,
- ! and processor boundary conditions.
- if (bc_z%end <= -3) then
- do i = 1, buff_size
- dz(p + i) = dz(p)
- end do
- elseif (bc_z%end == -2) then
- do i = 1, buff_size
- dz(p + i) = dz(p - (i - 1))
- end do
- elseif (bc_z%end == -1) then
- do i = 1, buff_size
- dz(p + i) = dz(i - 1)
- end do
- else
- call s_mpi_sendrecv_grid_variables_buffers(3, 1)
- end if
-
- ! Populating the cell-boundary locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- z_cb(p + i) = z_cb(p + (i - 1)) + dz(p + i)
- end do
- ! Populating the cell-center locations buffer, at the end of the
- ! coordinate direction, from buffer of the cell-width distribution
- do i = 1, buff_size
- z_cc(p + i) = z_cc(p + (i - 1)) + (dz(p + (i - 1)) + dz(p + i))/2d0
- end do
-
- ! END: Population of Buffers in z-direction ========================
-
- end subroutine s_populate_grid_variables_buffers
-
!> The purpose of this procedure is to initialize the
!! values of the internal-energy equations of each phase
!! from the mass of each phase, the mixture momentum and
@@ -1398,6 +1200,7 @@ contains
subroutine s_initialize_modules
call s_initialize_global_parameters_module()
+
!Quadrature weights and nodes for polydisperse simulations
if (bubbles .and. nb > 1 .and. R0_type == 1) then
call s_simpson
@@ -1417,6 +1220,7 @@ contains
call acc_present_dump()
#endif
+ call s_initialize_mpi_common_module()
call s_initialize_mpi_proxy_module()
call s_initialize_variables_conversion_module()
if (grid_geometry == 3) call s_initialize_fftw_module()
@@ -1458,6 +1262,8 @@ contains
call acc_present_dump()
#endif
+ call s_initialize_boundary_conditions_module()
+
! Reading in the user provided initial condition and grid data
call s_read_data_files(q_cons_ts(1)%vf)
@@ -1467,7 +1273,7 @@ contains
if (acoustic_source) call s_precalculate_acoustic_spatial_sources()
! Populating the buffers of the grid variables using the boundary conditions
- call s_populate_grid_variables_buffers()
+ call s_mpi_sendrecv_grid_spacing_buffers(bc_id_sfs)
! Computation of parameters, allocation of memory, association of pointers,
! and/or execution of any other tasks that are needed to properly configure
@@ -1573,11 +1379,10 @@ contains
!$acc update device(sigma, surface_tension)
!$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc)
-
- !$acc update device(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3)
- !$acc update device(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3)
- !$acc update device(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3)
+ !$acc update device(bc_x%vel_beg, bc_x%vel_end)
+ !$acc update device(bc_y%vel_beg, bc_y%vel_end)
+ !$acc update device(bc_z%vel_beg, bc_z%vel_end)
!$acc update device(bc_x%grcbc_in, bc_x%grcbc_out, bc_x%grcbc_vel_out)
!$acc update device(bc_y%grcbc_in, bc_y%grcbc_out, bc_y%grcbc_vel_out)
!$acc update device(bc_z%grcbc_in, bc_z%grcbc_out, bc_z%grcbc_vel_out)
@@ -1605,6 +1410,7 @@ contains
call s_finalize_variables_conversion_module()
if (grid_geometry == 3) call s_finalize_fftw_module
call s_finalize_mpi_proxy_module()
+ call s_finalize_mpi_common_module()
call s_finalize_global_parameters_module()
if (relax) call s_finalize_relaxation_solver_module()
if (viscous) then
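The removed `s_populate_grid_variables_buffers` filled the ghost-cell grid spacing by hand according to the boundary-condition type; the simulation now defers to `s_mpi_sendrecv_grid_spacing_buffers(bc_id_sfs)`. As a reference for what that fill did, here is a minimal Python sketch of the beginning-of-domain, x-direction branch only (the dict-indexed arrays and the function name are illustrative, not solver code):

```python
def fill_x_beg_buffers(dx, x_cb, x_cc, m, buff_size, bc_beg):
    # dx, x_cb, x_cc: dicts keyed by (possibly negative) cell index, standing in
    # for the Fortran arrays dimensioned over -buff_size : m + buff_size.
    if bc_beg >= 0:
        raise NotImplementedError("processor boundary: handled by the MPI exchange")
    for i in range(1, buff_size + 1):
        if bc_beg <= -3:               # ghost-cell extrapolation
            dx[-i] = dx[0]
        elif bc_beg == -2:             # symmetry
            dx[-i] = dx[i - 1]
        else:                          # bc_beg == -1: periodic on a single rank
            dx[-i] = dx[m - (i - 1)]
    for i in range(1, buff_size + 1):  # cell-boundary locations from cell widths
        x_cb[-1 - i] = x_cb[-i] - dx[-i]
    for i in range(1, buff_size + 1):  # cell-center locations from cell widths
        x_cc[-i] = x_cc[1 - i] - (dx[1 - i] + dx[-i]) / 2.0
```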
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index afe909ca52..75d0b91d14 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -16,6 +16,8 @@ module m_viscous
use m_helper
use m_finite_differences
+
+ use m_boundary_conditions_common
! ==========================================================================
private; public s_get_viscous, &
@@ -1303,7 +1305,8 @@ contains
type(scalar_field), intent(inout) :: grad_z
type(int_bounds_info) :: ix, iy, iz
- integer :: j, k, l !< Generic loop iterators
+ @:BOUNDARY_CONDITION_INTEGER_DECLARATIONS()
+ integer :: iter_loc
ix%beg = 1 - buff_size; ix%end = m + buff_size - 1
if (n > 0) then
@@ -1397,66 +1400,31 @@ contains
end if
end if
- if (bc_x%beg <= -3) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do l = idwbuff(3)%beg, idwbuff(3)%end
- do k = idwbuff(2)%beg, idwbuff(2)%end
- grad_x%sf(0, k, l) = (-3d0*var%sf(0, k, l) + 4d0*var%sf(1, k, l) - var%sf(2, k, l))/ &
- (x_cc(2) - x_cc(0))
- end do
- end do
- end if
- if (bc_x%end <= -3) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do l = idwbuff(3)%beg, idwbuff(3)%end
- do k = idwbuff(2)%beg, idwbuff(2)%end
- grad_x%sf(m, k, l) = (3d0*var%sf(m, k, l) - 4d0*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
- (x_cc(m) - x_cc(m - 2))
- end do
- end do
- end if
- if (n > 0) then
- if (bc_y%beg <= -3 .and. bc_y%beg /= -13) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do l = idwbuff(3)%beg, idwbuff(3)%end
- do j = idwbuff(1)%beg, idwbuff(1)%end
- grad_y%sf(j, 0, l) = (-3d0*var%sf(j, 0, l) + 4d0*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
- (y_cc(2) - y_cc(0))
- end do
- end do
- end if
- if (bc_y%end <= -3) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do l = idwbuff(3)%beg, idwbuff(3)%end
- do j = idwbuff(1)%beg, idwbuff(1)%end
- grad_y%sf(j, n, l) = (3d0*var%sf(j, n, l) - 4d0*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
- (y_cc(n) - y_cc(n - 2))
- end do
- end do
+ #:for dir, cmp in zip([1, 2, 3], ['x', 'y', 'z'])
+ if (${dir}$ <= num_dims .and. any(bc_id_has_bc(${dir}$, -1, :-3))) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir=dir, loc="-1", thickness=1, pack_v_size=1)
+                    if (bc_id_sfs(${dir}$, -1)%sf(exlhs, eylhs, ezlhs)%type <= -3 .and. bc_id_sfs(${dir}$, -1)%sf(exlhs, eylhs, ezlhs)%type /= -13) then
+ grad_${cmp}$%sf(ex, ey, ez) = &
+ (iter_loc*3d0*var%sf(ex, ey, ez) - &
+ iter_loc*4d0*var%sf(ex - locx, ey - locy, ez - locz) + &
+ iter_loc*var%sf(ex - 2*locx, ey - 2*locy, ez - 2*locz))/ &
+ (${cmp}$_cc(2) - ${cmp}$_cc(0))
+ end if
+ #:endblock
end if
- if (p > 0) then
- if (bc_z%beg <= -3) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do k = idwbuff(2)%beg, idwbuff(2)%end
- do j = idwbuff(1)%beg, idwbuff(1)%end
- grad_z%sf(j, k, 0) = &
- (-3d0*var%sf(j, k, 0) + 4d0*var%sf(j, k, 1) - var%sf(j, k, 2))/ &
- (z_cc(2) - z_cc(0))
- end do
- end do
- end if
- if (bc_z%end <= -3) then
- !$acc parallel loop collapse(2) gang vector default(present)
- do k = idwbuff(2)%beg, idwbuff(2)%end
- do j = idwbuff(1)%beg, idwbuff(1)%end
- grad_z%sf(j, k, p) = &
- (3d0*var%sf(j, k, p) - 4d0*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ &
- (z_cc(p) - z_cc(p - 2))
- end do
- end do
- end if
+
+ if (${dir}$ <= num_dims .and. any(bc_id_has_bc(${dir}$, +1, :-3))) then
+ #:block ITERATE_OVER_BUFFER_REGION_SIDED(dir=dir, loc="+1", thickness=1, pack_v_size=1)
+                    if (bc_id_sfs(${dir}$, +1)%sf(exlhs, eylhs, ezlhs)%type <= -3 .and. bc_id_sfs(${dir}$, +1)%sf(exlhs, eylhs, ezlhs)%type /= -13) then
+ grad_${cmp}$%sf(ex, ey, ez) = &
+ (-iter_loc*3d0*var%sf(ex, ey, ez) + &
+ iter_loc*4d0*var%sf(ex - locx, ey - locy, ez - locz) - &
+ iter_loc*var%sf(ex - 2*locx, ey - 2*locy, ez - 2*locz))/ &
+ (${cmp}$_cc(e${cmp}$) - ${cmp}$_cc(e${cmp}$-2))
+ end if
+ #:endblock
end if
- end if
+ #:endfor
end subroutine s_compute_fd_gradient
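The fypp block above folds the removed per-direction branches into one template without changing the stencil. For reference, the second-order one-sided differences it applies at the domain faces, as a small Python sketch (helper names and 0-based indexing are illustrative):

```python
def one_sided_grad_beg(var, x_cc):
    # Stencil at the first interior cell, as in the removed loops:
    # (-3*f0 + 4*f1 - f2) / (x2 - x0)
    return (-3.0*var[0] + 4.0*var[1] - var[2]) / (x_cc[2] - x_cc[0])

def one_sided_grad_end(var, x_cc, m):
    # Mirrored stencil at the last interior cell m:
    # (3*fm - 4*f(m-1) + f(m-2)) / (xm - x(m-2))
    return (3.0*var[m] - 4.0*var[m - 1] + var[m - 2]) / (x_cc[m] - x_cc[m - 2])
```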
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 38f659a8e9..962cab30cb 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -96,10 +96,6 @@ module m_weno
! END: WENO Coefficients ===================================================
- integer :: v_size !< Number of WENO-reconstructed cell-average variables
-
- !$acc declare create(v_size)
-
!> @name Indical bounds in the s1-, s2- and s3-directions
!> @{
type(int_bounds_info) :: is1_weno, is2_weno, is3_weno
diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py
index cad27120a0..275fa295fe 100644
--- a/toolchain/mfc/args.py
+++ b/toolchain/mfc/args.py
@@ -82,6 +82,8 @@ def add_common_arguments(p, mask = None):
test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
+ test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
+
test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
test_meg.add_argument("--add-new-variables", action="store_true", default=False, help="(Test Generation) If new variables are found in D/ when running tests, add them to the golden files.")
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 5749bd1727..c091edbf6e 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -55,8 +55,28 @@ def analytic(self):
'cfl_const_dt': ParamType.LOG,
'chemistry': ParamType.LOG,
'cantera_file': ParamType.STR,
+ 'num_bc_patches': ParamType.INT,
}
+for i in range(100):
+ COMMON[f"patch_bc({i+1})%type"] = ParamType.INT
+ COMMON[f"patch_bc({i+1})%geometry"] = ParamType.INT
+ COMMON[f"patch_bc({i+1})%dir"] = ParamType.INT
+ COMMON[f"patch_bc({i+1})%loc"] = ParamType.INT
+ COMMON[f"patch_bc({i+1})%radius"] = ParamType.REAL
+
+ for j in range(1, 3+1):
+ COMMON[f"patch_bc({i+1})%centroid({j})"] = ParamType.REAL
+ COMMON[f"patch_bc({i+1})%length({j})"] = ParamType.REAL
+ COMMON[f"patch_bc({i+1})%vel({j})"] = ParamType.REAL
+
+for cmp in ["x", "y", "z"]:
+ for loc in ["beg", "end"]:
+ COMMON[f'bc_{cmp}%{loc}'] = ParamType.INT
+
+ for dir in range(1, 3+1):
+ COMMON[f'bc_{cmp}%vel_{loc}({dir})'] = ParamType.REAL
+
PRE_PROCESS = COMMON.copy()
PRE_PROCESS.update({
'old_grid': ParamType.LOG,
@@ -249,14 +269,6 @@ def analytic(self):
SIMULATION[f'patch_ib({ib_id})%length_{cmp}'] = ParamType.REAL
for cmp in ["x", "y", "z"]:
- SIMULATION[f'bc_{cmp}%beg'] = ParamType.INT
- SIMULATION[f'bc_{cmp}%end'] = ParamType.INT
- SIMULATION[f'bc_{cmp}%vb1'] = ParamType.REAL
- SIMULATION[f'bc_{cmp}%vb2'] = ParamType.REAL
- SIMULATION[f'bc_{cmp}%vb3'] = ParamType.REAL
- SIMULATION[f'bc_{cmp}%ve1'] = ParamType.REAL
- SIMULATION[f'bc_{cmp}%ve2'] = ParamType.REAL
- SIMULATION[f'bc_{cmp}%ve3'] = ParamType.REAL
SIMULATION[f'bc_{cmp}%pres_in'] = ParamType.REAL
SIMULATION[f'bc_{cmp}%pres_out'] = ParamType.REAL
SIMULATION[f'bc_{cmp}%grcbc_in'] = ParamType.LOG
@@ -275,7 +287,6 @@ def analytic(self):
SIMULATION[f'{var}_{cmp}'] = ParamType.REAL
SIMULATION[f'bf_{cmp}'] = ParamType.LOG
-
for prepend in ["domain%beg", "domain%end"]:
SIMULATION[f"{cmp}_{prepend}"] = ParamType.REAL
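To make the newly registered keys above concrete, a hypothetical excerpt of a case-parameter dictionary could read as follows; the values and the single patch shown are placeholders, not a validated case setup:

```python
case_params = {
    'num_bc_patches': 1,
    # One boundary-condition patch on the beginning x-face (all values placeholders):
    'patch_bc(1)%type': -2,
    'patch_bc(1)%dir': 1,            # x-direction
    'patch_bc(1)%loc': -1,           # beginning face
    'patch_bc(1)%geometry': 1,
    'patch_bc(1)%centroid(2)': 0.5,
    'patch_bc(1)%length(2)': 0.25,
    'patch_bc(1)%vel(1)': 0.0,
    # Domain-wide boundary conditions and the renamed boundary velocities:
    'bc_x%beg': -2, 'bc_x%end': -2,
    'bc_x%vel_beg(1)': 0.0,
    'bc_x%vel_end(1)': 0.0,
}
```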
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index beedb639d4..0b84f83227 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -1,5 +1,5 @@
import os, typing, shutil, time, itertools
-from random import sample
+from random import sample, seed
import rich, rich.table
@@ -55,6 +55,7 @@ def __filter(cases_) -> typing.List[TestCase]:
if ARG("percent") == 100:
return cases
+ seed(time.time())
return sample(cases, k=int(len(cases)*ARG("percent")/100.0))
@@ -140,6 +141,10 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
case.delete_output()
case.create_directory()
+ if ARG("dry_run"):
+ cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
+ return
+
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
@@ -224,7 +229,10 @@ def handle_case(case: TestCase, devices: typing.Set[int]):
try:
_handle_case(case, devices)
- nPASS += 1
+ if ARG("dry_run"):
+ nSKIP += 1
+ else:
+ nPASS += 1
except Exception as exc:
if nAttempts < ARG("max_attempts"):
cons.print(f"[bold yellow] Attempt {nAttempts}: Failed test {case.get_uuid()}. Retrying...[/bold yellow]")
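Taken together, the `test.py` changes make `--dry-run` stop after case generation and count the case as skipped rather than passed. A condensed sketch of that control flow (the `run_case` helper and the counter dictionary are illustrative, not the toolchain's actual structure):

```python
def handle(case, dry_run, counters):
    case.delete_output()
    case.create_directory()
    if dry_run:
        counters["SKIP"] += 1   # case files exist on disk, but nothing is executed
        return
    run_case(case)              # pre_process + simulation, then golden-file checks
    counters["PASS"] += 1
```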