diff --git a/Makefile b/Makefile index 1b305a7..b995dd6 100644 --- a/Makefile +++ b/Makefile @@ -91,22 +91,22 @@ S4_BINNAME = $(OBJDIR)/S4 S4_LIBNAME = $(OBJDIR)/libS4.a S4r_LIBNAME = $(OBJDIR)/libS4r.a -#### Download, compile, and install boost serialization lib. +#### Download, compile, and install boost serialization lib. #### This should all work fine, you must modify BOOST_INC, BOOST_LIBS, -#### and PREFIX if you want to install boost to a different location +#### and PREFIX if you want to install boost to a different location # Specify the paths to the boost include and lib directories -BOOST_PREFIX=${CURDIR}/S4 +BOOST_-L$(BOOST_PREFIX)/lib/PREFIX=${CURDIR}/S4 BOOST_INC = -I$(BOOST_PREFIX)/include -BOOST_LIBS = -L$(BOOST_PREFIX)/lib/ -lboost_serialization -BOOST_URL=https://sourceforge.net/projects/boost/files/boost/1.61.0/boost_1_61_0.tar.gz +BOOST_LIBS = -lboost_serialization +BOOST_URL=https://sourceforge.net/projects/boost/files/boost/1.74.0/boost_1_74_0.tar.gz BOOST_FILE=boost.tar.gz # Target for downloading boost from above URL $(BOOST_FILE): wget $(BOOST_URL) -O $(BOOST_FILE) # Target for extracting boost from archive and compiling. Depends on download target above -${CURDIR}/S4/lib: $(BOOST_FILE) +${CURDIR}/S4/lib: $(BOOST_FILE) $(eval BOOST_DIR := $(shell tar tzf $(BOOST_FILE) | sed -e 's@/.*@@' | uniq)) @echo Boost dir is $(BOOST_DIR) tar -xzvf $(BOOST_FILE) @@ -120,32 +120,32 @@ boost: $(BOOST_PREFIX)/lib #### Set the compilation flags -CPPFLAGS = -Wall -I. -IS4 -IS4/RNP -IS4/kiss_fft - +CPPFLAGS = -Wall -I. -IS4 -IS4/RNP -IS4/kiss_fft + ifeq ($(S4_PROF), 1) CPPFLAGS += -g -pg endif ifeq ($(S4_DEBUG), 1) -CPPFLAGS += -ggdb +CPPFLAGS += -ggdb endif ifeq ($(S4_DEBUG), 2) CPPFLAGS += -DENABLE_S4_TRACE -CPPFLAGS += -ggdb +CPPFLAGS += -ggdb endif ifeq ($(S4_DEBUG), 3) CPPFLAGS += -DENABLE_S4_TRACE CPPFLAGS += -DDUMP_MATRICES -CPPFLAGS += -ggdb +CPPFLAGS += -ggdb endif ifeq ($(S4_DEBUG), 4) CPPFLAGS += -DENABLE_S4_TRACE CPPFLAGS += -DDUMP_MATRICES CPPFLAGS += -DDUMP_MATRICES_LARGE -CPPFLAGS += -ggdb +CPPFLAGS += -ggdb endif ifdef BOOST_INC @@ -188,7 +188,7 @@ objdir: mkdir -p $(OBJDIR)/S4k mkdir -p $(OBJDIR)/S4r mkdir -p $(OBJDIR)/modules - + S4_LIBOBJS = \ $(OBJDIR)/S4k/S4.o \ $(OBJDIR)/S4k/rcwa.o \ @@ -288,7 +288,7 @@ $(OBJDIR)/S4k/convert.o: S4/convert.c $(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ - + $(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp @@ -315,11 +315,11 @@ $(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ $(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h $(CC) -c -O3 $< -o $@ -$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ $(OBJDIR)/S4r/predicates.o: S4r/predicates.c $(CC) -c -O3 $< -o $@ - + #### Lua Frontend $(OBJDIR)/S4k/main_lua.o: S4/main_lua.c objdir @@ -352,4 +352,4 @@ S4_pyext: objdir $(S4_LIBNAME) pip3 install --upgrade ./ clean: - rm -rf $(OBJDIR) + rm -rf $(OBJDIR) \ No newline at end of file diff --git a/Makefile.aarch64 b/Makefile.aarch64 new file mode 100644 index 0000000..28c4ce0 --- /dev/null +++ b/Makefile.aarch64 @@ -0,0 +1,354 @@ +# To build: +# make +# Use the 'lib' target first to build the library, then either the Lua +# or Python targets are 'S4lua' and 'python_ext', respectively. + +# Set these to the flags needed to link against BLAS and Lapack. +# If left blank, then performance may be very poor. +# On Mac OS, +# BLAS_LIB = -framework vecLib +# LAPACK_LIB = -framework vecLib +# On Fedora: dnf install openblas-devel +# On Debian and Fedora, with reference BLAS and Lapack (slow) +# BLAS_LIB = -lblas +# LAPACK_LIB = -llapack +# NOTE: on Fedora, need to link blas and lapack properly, where X.X.X is some version numbers +# Linking Command Example: sudo ln -s /usr/lib64/liblapack.so.X.X.X /usr/lib64/liblapack.so +# blas Example: sudo ln -s /usr/lib64/libopeblas64.so.X.X.X /usr/lib64/libblas.so +# Can also use -L to link to the explicit libary path +# For example to use blas/lapack or even mkl installed in a anaconda conda environment: +# BLAS_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +# LAPACK_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +BLAS_LIB = -lblas +LAPACK_LIB = -llapack + +# Specify the flags for Lua headers and libraries (only needed for Lua frontend) +# Recommended: build lua in the current directory, and link against this local version +# LUA_INC = -I./lua-5.2.4/install/include +# LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm +LUA_INC = -I./lua-5.2.4/install/include +LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm + +# OPTIONAL +# Typically if installed, +# FFTW3_INC can be left empty +# FFTW3_LIB = -lfftw3 +# or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3 +# FTW3_LIB = -lfftw +# May need to link libraries properly as with blas and lapack above +# FFTW3_INC = -I$(CONDA_PREFIX)/include/ +# FFTW3_LIB = -L$(CONDA_PREFIX)/lib/ -lfftw3 +FFTW3_INC = +FFTW3_LIB = -lfftw3 + +# Typically, +# PTHREAD_INC = -DHAVE_UNISTD_H +# PTHREAD_LIB = -lpthread +# If conda env +# PTHREAD_LIB = -L$(CONDA_PREFIX)/lib/ -lpthread +PTHREAD_INC = -DHAVE_UNISTD_H +#PTHREAD_LIB = -lpthread + +# OPTIONAL +# If not installed: +# Fedora: dnf install libsuitsparse-devel +# Typically, if installed: +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd +# Alternate Example: I compiled and istalled SuiteSpares in the directory above this S4 directory +# CHOLMOD_INC = -I../SuiteSparse/include/ +# CHOLMOD_LIB = -L../SuiteSparse/lib/ -lcholmod -lamd -lcolamd -lcamd -lccolamd +CHOLMOD_INC = -I/usr/include/suitesparse +CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd + +# Specify the MPI library +# For example, on Fedora: dnf install openmpi-devel +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi +#MPI_LIB = -lmpi +# or, explicitly link to the library with -L, example below +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so + +# Enable S4_TRACE debugging +# values of 1, 2, 3 enable debugging, with verbosity increasing as +# value increases. 0 to disable +S4_DEBUG = 0 +S4_PROF = 0 + +# Specify custom compilers if needed +CXX = g++ +CC = gcc + +CFLAGS = -O3 -fPIC -march=native -mtune=native + +# options for Sampler module +OPTFLAGS = -O3 + +OBJDIR = ./build +S4_BINNAME = $(OBJDIR)/S4 +S4_LIBNAME = $(OBJDIR)/libS4.a +S4r_LIBNAME = $(OBJDIR)/libS4r.a + +#### Download, compile, and install boost serialization lib. +#### This should all work fine, you must modify BOOST_INC, BOOST_LIBS, +#### and PREFIX if you want to install boost to a different location + +# Specify the paths to the boost include and lib directories +BOOST_PREFIX=${CURDIR}/S4 +BOOST_INC = -I$(BOOST_PREFIX)/include +BOOST_LIBS = -L$(BOOST_PREFIX)/lib/ -lboost_serialization +BOOST_URL=https://sourceforge.net/projects/boost/files/boost/1.74.0/boost_1_74_0.tar.gz +BOOST_FILE=boost.tar.gz +# Target for downloading boost from above URL +$(BOOST_FILE): + wget $(BOOST_URL) -O $(BOOST_FILE) + +# Target for extracting boost from archive and compiling. Depends on download target above +${CURDIR}/S4/lib: $(BOOST_FILE) + $(eval BOOST_DIR := $(shell tar tzf $(BOOST_FILE) | sed -e 's@/.*@@' | uniq)) + @echo Boost dir is $(BOOST_DIR) + tar -xzvf $(BOOST_FILE) + mv $(BOOST_DIR) boost_src + cd boost_src && ./bootstrap.sh --with-libraries=serialization --prefix=$(BOOST_PREFIX) && ./b2 install +# Final target which pulls everything together +boost: $(BOOST_PREFIX)/lib + +##################### DO NOT EDIT BELOW THIS LINE ##################### + + +#### Set the compilation flags + +CPPFLAGS = -Wall -I. -IS4 -IS4/RNP -IS4/kiss_fft + +ifeq ($(S4_PROF), 1) +CPPFLAGS += -g -pg +endif + +ifeq ($(S4_DEBUG), 1) +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 2) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 3) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 4) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -DDUMP_MATRICES_LARGE +CPPFLAGS += -ggdb +endif + +ifdef BOOST_INC + CPPFLAGS += $(BOOST_INC) $(BOOST_LIBS) +endif + +ifdef BLAS_LIB +CPPFLAGS += -DHAVE_BLAS +endif + + +ifdef LAPACK_LIB +CPPFLAGS += -DHAVE_LAPACK +endif + +ifdef FFTW3_LIB +CPPFLAGS += -DHAVE_FFTW3 $(FFTW3_INC) +endif + +ifdef PTHREAD_LIB +CPPFLAGS += -DHAVE_LIBPTHREAD $(PTHREAD_INC) +endif + +ifdef CHOLMOD_LIB +CPPFLAGS += -DHAVE_LIBCHOLMOD $(CHOLMOD_INC) +endif + +ifdef MPI_LIB +CPPFLAGS += -DHAVE_MPI $(MPI_INC) +endif + +LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB) $(BOOST_LIBS) + +#### Compilation targets + +all: $(S4_LIBNAME) + +objdir: + mkdir -p $(OBJDIR) + mkdir -p $(OBJDIR)/S4k + mkdir -p $(OBJDIR)/S4r + mkdir -p $(OBJDIR)/modules + +S4_LIBOBJS = \ + $(OBJDIR)/S4k/S4.o \ + $(OBJDIR)/S4k/rcwa.o \ + $(OBJDIR)/S4k/fmm_common.o \ + $(OBJDIR)/S4k/fmm_FFT.o \ + $(OBJDIR)/S4k/fmm_kottke.o \ + $(OBJDIR)/S4k/fmm_closed.o \ + $(OBJDIR)/S4k/fmm_PolBasisNV.o \ + $(OBJDIR)/S4k/fmm_PolBasisVL.o \ + $(OBJDIR)/S4k/fmm_PolBasisJones.o \ + $(OBJDIR)/S4k/fmm_experimental.o \ + $(OBJDIR)/S4k/fft_iface.o \ + $(OBJDIR)/S4k/pattern.o \ + $(OBJDIR)/S4k/intersection.o \ + $(OBJDIR)/S4k/predicates.o \ + $(OBJDIR)/S4k/numalloc.o \ + $(OBJDIR)/S4k/gsel.o \ + $(OBJDIR)/S4k/sort.o \ + $(OBJDIR)/S4k/kiss_fft.o \ + $(OBJDIR)/S4k/kiss_fftnd.o \ + $(OBJDIR)/S4k/SpectrumSampler.o \ + $(OBJDIR)/S4k/cubature.o \ + $(OBJDIR)/S4k/Interpolator.o \ + $(OBJDIR)/S4k/convert.o + +S4r_LIBOBJS = \ + $(OBJDIR)/S4r/Material.o \ + $(OBJDIR)/S4r/LatticeGridRect.o \ + $(OBJDIR)/S4r/LatticeGridArb.o \ + $(OBJDIR)/S4r/POFF2Mesh.o \ + $(OBJDIR)/S4r/PeriodicMesh.o \ + $(OBJDIR)/S4r/Shape.o \ + $(OBJDIR)/S4r/Simulation.o \ + $(OBJDIR)/S4r/Layer.o \ + $(OBJDIR)/S4r/Pseudoinverse.o \ + $(OBJDIR)/S4r/Eigensystems.o \ + $(OBJDIR)/S4r/IRA.o \ + $(OBJDIR)/S4r/intersection.o \ + $(OBJDIR)/S4r/predicates.o \ + $(OBJDIR)/S4r/periodic_off2.o + +ifndef LAPACK_LIB + S4_LIBOBJS += $(OBJDIR)/S4k/Eigensystems.o +endif + +$(S4_LIBNAME): objdir $(S4_LIBOBJS) + $(AR) crvs $@ $(S4_LIBOBJS) +$(S4r_LIBNAME): objdir $(S4r_LIBOBJS) + $(AR) crvs $@ $(S4r_LIBOBJS) + +$(OBJDIR)/S4k/S4.o: S4/S4.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/rcwa.o: S4/rcwa.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_common.o: S4/fmm/fmm_common.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_FFT.o: S4/fmm/fmm_FFT.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_kottke.o: S4/fmm/fmm_kottke.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_closed.o: S4/fmm/fmm_closed.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisNV.o: S4/fmm/fmm_PolBasisNV.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisVL.o: S4/fmm/fmm_PolBasisVL.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisJones.o: S4/fmm/fmm_PolBasisJones.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_experimental.o: S4/fmm/fmm_experimental.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fft_iface.o: S4/fmm/fft_iface.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/pattern.o: S4/pattern/pattern.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/intersection.o: S4/pattern/intersection.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/predicates.o: S4/pattern/predicates.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/numalloc.o: S4/numalloc.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/gsel.o: S4/gsel.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/sort.o: S4/sort.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fft.o: S4/kiss_fft/kiss_fft.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fftnd.o: S4/kiss_fft/tools/kiss_fftnd.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/SpectrumSampler.o: S4/SpectrumSampler.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/cubature.o: S4/cubature.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Interpolator.o: S4/Interpolator.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/convert.o: S4/convert.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ + + + + +$(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridRect.o: S4r/LatticeGridRect.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridArb.o: S4r/LatticeGridArb.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/POFF2Mesh.o: S4r/POFF2Mesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/PeriodicMesh.o: S4r/PeriodicMesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Shape.o: S4r/Shape.cpp S4r/Shape.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Simulation.o: S4r/Simulation.cpp S4r/Simulation.hpp S4r/StarProduct.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Layer.o: S4r/Layer.cpp S4r/Layer.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Pseudoinverse.o: S4r/Pseudoinverse.cpp S4r/Pseudoinverse.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Eigensystems.o: S4r/Eigensystems.cpp S4r/Eigensystems.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h + $(CC) -c -O3 $< -o $@ +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4r/predicates.o: S4r/predicates.c + $(CC) -c -O3 $< -o $@ + +#### Lua Frontend + +$(OBJDIR)/S4k/main_lua.o: S4/main_lua.c objdir + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +S4lua: $(OBJDIR)/S4k/main_lua.o $(S4_LIBNAME) sampler + $(CXX) $(CFLAGS) $(CPPFLAGS) $< -o $(S4_BINNAME) $(S4_LIBNAME) $(LIBS) $(LUA_LIB) + +$(OBJDIR)/S4r/main_lua.o: S4r/main_lua.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +$(OBJDIR)/S4r/lua_named_arg.o: S4r/lua_named_arg.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +$(OBJDIR)/S4r/S4r.o: S4r/S4r.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +S4rlua: objdir $(OBJDIR)/S4r/main_lua.o $(OBJDIR)/S4r/lua_named_arg.o $(OBJDIR)/S4r/S4r.o $(S4r_LIBNAME) + $(CXX) $(CFLAGS) $(CPPFLAGS) $(OBJDIR)/S4r/main_lua.o $(OBJDIR)/S4r/lua_named_arg.o $(OBJDIR)/S4r/S4r.o -o $@ $(S4r_LIBNAME) $(LIBS) $(LUA_LIB) + +sampler: FunctionSampler1D.so FunctionSampler2D.so +FunctionSampler1D.so: modules/function_sampler_1d.c modules/function_sampler_1d.h modules/lua_function_sampler_1d.c + gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_1d.c -o $(OBJDIR)/modules/function_sampler_1d.o + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler1D.so $(OBJDIR)/modules/function_sampler_1d.o modules/lua_function_sampler_1d.c $(LUA_LIB) +FunctionSampler2D.so: modules/function_sampler_2d.c modules/function_sampler_2d.h modules/lua_function_sampler_2d.c + gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_2d.c -o $(OBJDIR)/modules/function_sampler_2d.o + gcc -c -O2 -fpic -Wall -I. modules/predicates.c -o $(OBJDIR)/modules/mod_predicates.o + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler2D.so $(OBJDIR)/modules/function_sampler_2d.o $(OBJDIR)/modules/mod_predicates.o modules/lua_function_sampler_2d.c $(LUA_LIB) + +#### Python extension + +S4_pyext: objdir $(S4_LIBNAME) + sh gensetup.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" $(BOOST_PREFIX) + pip3 install --upgrade ./ + +clean: + rm -rf $(OBJDIR) diff --git a/Makefile.m1 b/Makefile.m1 new file mode 100644 index 0000000..c3e3d1e --- /dev/null +++ b/Makefile.m1 @@ -0,0 +1,337 @@ +# To build: +# make +# Use the 'lib' target first to build the library, then either the Lua +# or Python targets are 'S4lua' and 'python_ext', respectively. + +# Set these to the flags needed to link against BLAS and Lapack. +# If left blank, then performance may be very poor. +# On Mac OS, +# BLAS_LIB = -framework vecLib +# LAPACK_LIB = -framework vecLib +# On Fedora: dnf install openblas-devel +# On Debian and Fedora, with reference BLAS and Lapack (slow) +# BLAS_LIB = -lblas +# LAPACK_LIB = -llapack +# NOTE: on Fedora, need to link blas and lapack properly, where X.X.X is some version numbers +# Linking Command Example: sudo ln -s /usr/lib64/liblapack.so.X.X.X /usr/lib64/liblapack.so +# blas Example: sudo ln -s /usr/lib64/libopeblas64.so.X.X.X /usr/lib64/libblas.so +# Can also use -L to link to the explicit libary path +# For example to use blas/lapack or even mkl installed in a anaconda conda environment: +# BLAS_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +# LAPACK_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +BLAS_LIB = -lblas +LAPACK_LIB = -llapack + +# Specify the flags for Lua headers and libraries (only needed for Lua frontend) +# Recommended: build lua in the current directory, and link against this local version +# LUA_INC = -I./lua-5.2.4/install/include +# LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm +LUA_INC = -I./lua-5.2.4/install/include +LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm + +# OPTIONAL +# Typically if installed, +# FFTW3_INC can be left empty +# FFTW3_LIB = -lfftw3 +# or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3 +# FTW3_LIB = -lfftw +# May need to link libraries properly as with blas and lapack above +# FFTW3_INC = -I$(CONDA_PREFIX)/include/ +# FFTW3_LIB = -L$(CONDA_PREFIX)/lib/ -lfftw3 +FFTW3_INC = -I/opt/homebrew/include/ +FFTW3_LIB = -lfftw3 + +# Typically, +# PTHREAD_INC = -DHAVE_UNISTD_H +# PTHREAD_LIB = -lpthread +# If conda env +# PTHREAD_LIB = -L$(CONDA_PREFIX)/lib/ -lpthread +PTHREAD_INC = -DHAVE_UNISTD_H +#PTHREAD_LIB = -lpthread + +# OPTIONAL +# If not installed: +# Fedora: dnf install libsuitsparse-devel +# Typically, if installed: +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd +# Alternate Example: I compiled and istalled SuiteSpares in the directory above this S4 directory +# CHOLMOD_INC = -I../SuiteSparse/include/ +# CHOLMOD_LIB = -L../SuiteSparse/lib/ -lcholmod -lamd -lcolamd -lcamd -lccolamd +CHOLMOD_INC = -I/opt/homebrew/include/suitesparse +CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd + +# Specify the MPI library +# For example, on Fedora: dnf install openmpi-devel +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi +#MPI_LIB = -lmpi +# or, explicitly link to the library with -L, example below +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so + +# Enable S4_TRACE debugging +# values of 1, 2, 3 enable debugging, with verbosity increasing as +# value increases. 0 to disable +S4_DEBUG = 0 +S4_PROF = 0 + +# Specify custom compilers if needed +CXX = clang++ +CC = clang + +#CFLAGS += -O3 -fPIC +CFLAGS = -Wall -O3 -m64 -mcpu=apple-m1 -mtune=native -fPIC + +# options for Sampler module +OPTFLAGS = -O3 + +OBJDIR = ./build +S4_BINNAME = $(OBJDIR)/S4 +S4_LIBNAME = $(OBJDIR)/libS4.a +S4r_LIBNAME = $(OBJDIR)/libS4r.a + +BOOST_INC = -I/opt/homebrew/include/ +BOOST_LIBS = -lboost_serialization +# will say -lboost_serialization is unused but this is A LIE, get "flat namespace" error without it!!! + +##################### DO NOT EDIT BELOW THIS LINE ##################### + + +#### Set the compilation flags + +CPPFLAGS = -Wall -I. -IS4 -IS4/RNP -IS4/kiss_fft + +ifeq ($(S4_PROF), 1) +CPPFLAGS += -g -pg +endif + +ifeq ($(S4_DEBUG), 1) +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 2) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 3) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 4) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -DDUMP_MATRICES_LARGE +CPPFLAGS += -ggdb +endif + +ifdef BOOST_INC + CPPFLAGS += $(BOOST_INC) $(BOOST_LIBS) +endif + +ifdef BLAS_LIB +CPPFLAGS += -DHAVE_BLAS +endif + + +ifdef LAPACK_LIB +CPPFLAGS += -DHAVE_LAPACK +endif + +ifdef FFTW3_LIB +CPPFLAGS += -DHAVE_FFTW3 $(FFTW3_INC) +endif + +ifdef PTHREAD_LIB +CPPFLAGS += -DHAVE_LIBPTHREAD $(PTHREAD_INC) +endif + +ifdef CHOLMOD_LIB +CPPFLAGS += -DHAVE_LIBCHOLMOD $(CHOLMOD_INC) +endif + +ifdef MPI_LIB +CPPFLAGS += -DHAVE_MPI $(MPI_INC) +endif + +LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB) $(BOOST_LIBS) + +#### Compilation targets + +all: $(S4_LIBNAME) + +objdir: + mkdir -p $(OBJDIR) + mkdir -p $(OBJDIR)/S4k + mkdir -p $(OBJDIR)/S4r + mkdir -p $(OBJDIR)/modules + +S4_LIBOBJS = \ + $(OBJDIR)/S4k/S4.o \ + $(OBJDIR)/S4k/rcwa.o \ + $(OBJDIR)/S4k/fmm_common.o \ + $(OBJDIR)/S4k/fmm_FFT.o \ + $(OBJDIR)/S4k/fmm_kottke.o \ + $(OBJDIR)/S4k/fmm_closed.o \ + $(OBJDIR)/S4k/fmm_PolBasisNV.o \ + $(OBJDIR)/S4k/fmm_PolBasisVL.o \ + $(OBJDIR)/S4k/fmm_PolBasisJones.o \ + $(OBJDIR)/S4k/fmm_experimental.o \ + $(OBJDIR)/S4k/fft_iface.o \ + $(OBJDIR)/S4k/pattern.o \ + $(OBJDIR)/S4k/intersection.o \ + $(OBJDIR)/S4k/predicates.o \ + $(OBJDIR)/S4k/numalloc.o \ + $(OBJDIR)/S4k/gsel.o \ + $(OBJDIR)/S4k/sort.o \ + $(OBJDIR)/S4k/kiss_fft.o \ + $(OBJDIR)/S4k/kiss_fftnd.o \ + $(OBJDIR)/S4k/SpectrumSampler.o \ + $(OBJDIR)/S4k/cubature.o \ + $(OBJDIR)/S4k/Interpolator.o \ + $(OBJDIR)/S4k/convert.o + +S4r_LIBOBJS = \ + $(OBJDIR)/S4r/Material.o \ + $(OBJDIR)/S4r/LatticeGridRect.o \ + $(OBJDIR)/S4r/LatticeGridArb.o \ + $(OBJDIR)/S4r/POFF2Mesh.o \ + $(OBJDIR)/S4r/PeriodicMesh.o \ + $(OBJDIR)/S4r/Shape.o \ + $(OBJDIR)/S4r/Simulation.o \ + $(OBJDIR)/S4r/Layer.o \ + $(OBJDIR)/S4r/Pseudoinverse.o \ + $(OBJDIR)/S4r/Eigensystems.o \ + $(OBJDIR)/S4r/IRA.o \ + $(OBJDIR)/S4r/intersection.o \ + $(OBJDIR)/S4r/predicates.o \ + $(OBJDIR)/S4r/periodic_off2.o + +ifndef LAPACK_LIB + S4_LIBOBJS += $(OBJDIR)/S4k/Eigensystems.o +endif + +$(S4_LIBNAME): objdir $(S4_LIBOBJS) + $(AR) crvs $@ $(S4_LIBOBJS) +$(S4r_LIBNAME): objdir $(S4r_LIBOBJS) + $(AR) crvs $@ $(S4r_LIBOBJS) + +$(OBJDIR)/S4k/S4.o: S4/S4.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/rcwa.o: S4/rcwa.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_common.o: S4/fmm/fmm_common.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_FFT.o: S4/fmm/fmm_FFT.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_kottke.o: S4/fmm/fmm_kottke.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_closed.o: S4/fmm/fmm_closed.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisNV.o: S4/fmm/fmm_PolBasisNV.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisVL.o: S4/fmm/fmm_PolBasisVL.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisJones.o: S4/fmm/fmm_PolBasisJones.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_experimental.o: S4/fmm/fmm_experimental.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fft_iface.o: S4/fmm/fft_iface.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/pattern.o: S4/pattern/pattern.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/intersection.o: S4/pattern/intersection.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/predicates.o: S4/pattern/predicates.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/numalloc.o: S4/numalloc.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/gsel.o: S4/gsel.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/sort.o: S4/sort.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fft.o: S4/kiss_fft/kiss_fft.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fftnd.o: S4/kiss_fft/tools/kiss_fftnd.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/SpectrumSampler.o: S4/SpectrumSampler.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/cubature.o: S4/cubature.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Interpolator.o: S4/Interpolator.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/convert.o: S4/convert.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ + + + + +$(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridRect.o: S4r/LatticeGridRect.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridArb.o: S4r/LatticeGridArb.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/POFF2Mesh.o: S4r/POFF2Mesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/PeriodicMesh.o: S4r/PeriodicMesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Shape.o: S4r/Shape.cpp S4r/Shape.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Simulation.o: S4r/Simulation.cpp S4r/Simulation.hpp S4r/StarProduct.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Layer.o: S4r/Layer.cpp S4r/Layer.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Pseudoinverse.o: S4r/Pseudoinverse.cpp S4r/Pseudoinverse.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Eigensystems.o: S4r/Eigensystems.cpp S4r/Eigensystems.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h + $(CC) -c -O3 $< -o $@ +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4r/predicates.o: S4r/predicates.c + $(CC) -c -O3 $< -o $@ + +#### Lua Frontend + +$(OBJDIR)/S4k/main_lua.o: S4/main_lua.c objdir + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +S4lua: $(OBJDIR)/S4k/main_lua.o $(S4_LIBNAME) sampler + $(CXX) $(CFLAGS) $(CPPFLAGS) $< -o $(S4_BINNAME) $(S4_LIBNAME) $(LIBS) $(LUA_LIB) + +$(OBJDIR)/S4r/main_lua.o: S4r/main_lua.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +$(OBJDIR)/S4r/lua_named_arg.o: S4r/lua_named_arg.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +$(OBJDIR)/S4r/S4r.o: S4r/S4r.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ +S4rlua: objdir $(OBJDIR)/S4r/main_lua.o $(OBJDIR)/S4r/lua_named_arg.o $(OBJDIR)/S4r/S4r.o $(S4r_LIBNAME) + $(CXX) $(CFLAGS) $(CPPFLAGS) $(OBJDIR)/S4r/main_lua.o $(OBJDIR)/S4r/lua_named_arg.o $(OBJDIR)/S4r/S4r.o -o $@ $(S4r_LIBNAME) $(LIBS) $(LUA_LIB) + +sampler: FunctionSampler1D.so FunctionSampler2D.so +FunctionSampler1D.so: modules/function_sampler_1d.c modules/function_sampler_1d.h modules/lua_function_sampler_1d.c + gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_1d.c -o $(OBJDIR)/modules/function_sampler_1d.o + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler1D.so $(OBJDIR)/modules/function_sampler_1d.o modules/lua_function_sampler_1d.c $(LUA_LIB) +FunctionSampler2D.so: modules/function_sampler_2d.c modules/function_sampler_2d.h modules/lua_function_sampler_2d.c + gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_2d.c -o $(OBJDIR)/modules/function_sampler_2d.o + gcc -c -O2 -fpic -Wall -I. modules/predicates.c -o $(OBJDIR)/modules/mod_predicates.o + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler2D.so $(OBJDIR)/modules/function_sampler_2d.o $(OBJDIR)/modules/mod_predicates.o modules/lua_function_sampler_2d.c $(LUA_LIB) + +#### Python extension + +BOOST_PREFIX = /opt/homebrew/ + +S4_pyext: objdir $(S4_LIBNAME) + sh gensetup.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" $(BOOST_PREFIX) + pip3 install --upgrade ./ + +clean: + rm -rf $(OBJDIR) diff --git a/Makefile.osx b/Makefile.mac_intel similarity index 68% rename from Makefile.osx rename to Makefile.mac_intel index 1e761a0..34b883f 100644 --- a/Makefile.osx +++ b/Makefile.mac_intel @@ -1,54 +1,87 @@ # To build: # make -# Use either 'S4lua' for Lua, or 'python_ext' for Python. -# S4lua also builds FunctionSampler1D and FunctionSampler2D, in modules. -# You can build the sampler seperately with 'make sampler' +# Use the 'lib' target first to build the library, then either the Lua +# or Python targets are 'S4lua' and 'python_ext', respectively. # Set these to the flags needed to link against BLAS and Lapack. # If left blank, then performance may be very poor. # On Mac OS, # BLAS_LIB = -framework vecLib # LAPACK_LIB = -framework vecLib -# On Debian, with reference BLAS and Lapack (slow) +# On Fedora: dnf install openblas-devel +# On Debian and Fedora, with reference BLAS and Lapack (slow) # BLAS_LIB = -lblas # LAPACK_LIB = -llapack -BLAS_LIB = -framework Accelerate -LAPACK_LIB = -framework Accelerate +# NOTE: on Fedora, need to link blas and lapack properly, where X.X.X is some version numbers +# Linking Command Example: sudo ln -s /usr/lib64/liblapack.so.X.X.X /usr/lib64/liblapack.so +# blas Example: sudo ln -s /usr/lib64/libopeblas64.so.X.X.X /usr/lib64/libblas.so +# Can also use -L to link to the explicit libary path +# For example to use blas/lapack or even mkl installed in a anaconda conda environment: +# BLAS_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +# LAPACK_LIB = -L$(CONDA_PREFIX)/lib/ -lmkl_core -lmkl_rt -lstdc++ -lpthread -lm -ldl +BLAS_LIB = -lblas +LAPACK_LIB = -llapack # Specify the flags for Lua headers and libraries (only needed for Lua frontend) # Recommended: build lua in the current directory, and link against this local version # LUA_INC = -I./lua-5.2.4/install/include -# LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl +# LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm LUA_INC = -I./lua-5.2.4/install/include -LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl +LUA_LIB = -L./lua-5.2.4/install/lib -llua -ldl -lm +# OPTIONAL # Typically if installed, # FFTW3_INC can be left empty -# FFTW3_LIB = -lfftw3 -FFTW3_INC = -FFTW3_LIB = +# FFTW3_LIB = -lfftw3 +# or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3 +# FTW3_LIB = -lfftw +# May need to link libraries properly as with blas and lapack above +# FFTW3_INC = -I$(CONDA_PREFIX)/include/ +# FFTW3_LIB = -L$(CONDA_PREFIX)/lib/ -lfftw3 +FFTW3_INC = -I/opt/homebrew/include/ +FFTW3_LIB = -lfftw3 # Typically, -# PTHREAD_INC = -DHAVE_UNISTD_H -lpthread +# PTHREAD_INC = -DHAVE_UNISTD_H # PTHREAD_LIB = -lpthread -PTHREAD_INC = -DHAVE_UNISTD_H -PTHREAD_LIB = -lpthread - -# Typically if installed, -# CHOLMOD_INC = -I/usr/local/include -# CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd -CHOLMOD_INC = -CHOLMOD_LIB = +# If conda env +# PTHREAD_LIB = -L$(CONDA_PREFIX)/lib/ -lpthread +PTHREAD_INC = -DHAVE_UNISTD_H +#PTHREAD_LIB = -lpthread + +# OPTIONAL +# If not installed: +# Fedora: dnf install libsuitsparse-devel +# Typically, if installed: +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd +# Alternate Example: I compiled and istalled SuiteSpares in the directory above this S4 directory +# CHOLMOD_INC = -I../SuiteSparse/include/ +# CHOLMOD_LIB = -L../SuiteSparse/lib/ -lcholmod -lamd -lcolamd -lcamd -lccolamd +CHOLMOD_INC = -I/opt/homebrew/include/suitesparse +CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd # Specify the MPI library -MPI_INC = -MPI_LIB = +# For example, on Fedora: dnf install openmpi-devel +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi +#MPI_LIB = -lmpi +# or, explicitly link to the library with -L, example below +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so + +# Enable S4_TRACE debugging +# values of 1, 2, 3 enable debugging, with verbosity increasing as +# value increases. 0 to disable +S4_DEBUG = 0 +S4_PROF = 0 # Specify custom compilers if needed -CXX = c++ -CC = cc +CXX = clang++ +CC = clang -CFLAGS = -O3 -fPIC +#CFLAGS += -O3 -fPIC +CFLAGS = -Wall -O3 -m64 -march=native -mtune=native -msse3 -msse2 -msse -fPIC # options for Sampler module OPTFLAGS = -O3 @@ -58,16 +91,50 @@ S4_BINNAME = $(OBJDIR)/S4 S4_LIBNAME = $(OBJDIR)/libS4.a S4r_LIBNAME = $(OBJDIR)/libS4r.a +BOOST_INC = -I/opt/homebrew/include/ +BOOST_LIBS = -lboost_serialization ##################### DO NOT EDIT BELOW THIS LINE ##################### + #### Set the compilation flags -CPPFLAGS = -I. -IS4 -IS4/RNP -IS4/kiss_fft +CPPFLAGS = -Wall -I. -IS4 -IS4/RNP -IS4/kiss_fft + +ifeq ($(S4_PROF), 1) +CPPFLAGS += -g -pg +endif + +ifeq ($(S4_DEBUG), 1) +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 2) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 3) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -ggdb +endif + +ifeq ($(S4_DEBUG), 4) +CPPFLAGS += -DENABLE_S4_TRACE +CPPFLAGS += -DDUMP_MATRICES +CPPFLAGS += -DDUMP_MATRICES_LARGE +CPPFLAGS += -ggdb +endif + +ifdef BOOST_INC + CPPFLAGS += $(BOOST_INC) $(BOOST_LIBS) +endif ifdef BLAS_LIB CPPFLAGS += -DHAVE_BLAS endif + ifdef LAPACK_LIB CPPFLAGS += -DHAVE_LAPACK endif @@ -88,7 +155,7 @@ ifdef MPI_LIB CPPFLAGS += -DHAVE_MPI $(MPI_INC) endif -LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) +LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB) $(BOOST_LIBS) #### Compilation targets @@ -199,7 +266,7 @@ $(OBJDIR)/S4k/convert.o: S4/convert.c $(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ - + $(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp @@ -226,16 +293,15 @@ $(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ $(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h $(CC) -c -O3 $< -o $@ -$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ $(OBJDIR)/S4r/predicates.o: S4r/predicates.c $(CC) -c -O3 $< -o $@ - + #### Lua Frontend $(OBJDIR)/S4k/main_lua.o: S4/main_lua.c objdir $(CC) -c $(CFLAGS) $(CPPFLAGS) $(LUA_INC) $< -o $@ - S4lua: $(OBJDIR)/S4k/main_lua.o $(S4_LIBNAME) sampler $(CXX) $(CFLAGS) $(CPPFLAGS) $< -o $(S4_BINNAME) $(S4_LIBNAME) $(LIBS) $(LUA_LIB) @@ -251,29 +317,19 @@ S4rlua: objdir $(OBJDIR)/S4r/main_lua.o $(OBJDIR)/S4r/lua_named_arg.o $(OBJDIR)/ sampler: FunctionSampler1D.so FunctionSampler2D.so FunctionSampler1D.so: modules/function_sampler_1d.c modules/function_sampler_1d.h modules/lua_function_sampler_1d.c gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_1d.c -o $(OBJDIR)/modules/function_sampler_1d.o - gcc $(OPTFLAGS) -bundle -undefined dynamic_lookup -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler1D.so $(OBJDIR)/modules/function_sampler_1d.o modules/lua_function_sampler_1d.c + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler1D.so $(OBJDIR)/modules/function_sampler_1d.o modules/lua_function_sampler_1d.c $(LUA_LIB) FunctionSampler2D.so: modules/function_sampler_2d.c modules/function_sampler_2d.h modules/lua_function_sampler_2d.c gcc -c $(OPTFLAGS) -fpic -Wall -I. modules/function_sampler_2d.c -o $(OBJDIR)/modules/function_sampler_2d.o gcc -c -O2 -fpic -Wall -I. modules/predicates.c -o $(OBJDIR)/modules/mod_predicates.o - gcc $(OPTFLAGS) -bundle -undefined dynamic_lookup -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler2D.so $(OBJDIR)/modules/function_sampler_2d.o $(OBJDIR)/modules/mod_predicates.o modules/lua_function_sampler_2d.c - - + gcc $(OPTFLAGS) -shared -fpic -Wall $(LUA_INC) -o $(OBJDIR)/FunctionSampler2D.so $(OBJDIR)/modules/function_sampler_2d.o $(OBJDIR)/modules/mod_predicates.o modules/lua_function_sampler_2d.c $(LUA_LIB) #### Python extension -S4_pyext: objdir $(S4_LIBNAME) - echo "$(LIBS)" > $(OBJDIR)/tmp.txt - sh gensetup.py.sh $(OBJDIR) $(S4_LIBNAME) - python setup.py build - -#### R extension +BOOST_PREFIX = /opt/homebrew/ -S4_R_frontend: objdir $(S4_LIBNAME) - R CMD BUILD RCWA - R CMD INSTALL RCWA - -################################################# +S4_pyext: objdir $(S4_LIBNAME) + sh gensetup.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" $(BOOST_PREFIX) + pip3 install --upgrade ./ clean: - rm -rf $(OBJDIR) - + rm -rf $(OBJDIR) \ No newline at end of file diff --git a/MakefileHPC b/MakefileHPC new file mode 100644 index 0000000..c6bef7b --- /dev/null +++ b/MakefileHPC @@ -0,0 +1,251 @@ +# to find path to openmpi, type mpiexec --test , the error messege reveals path +MKL_HOME = /apps/cent7/intel/compilers_and_libraries_2017.1.132/linux/mkl +BROWN_MPI_HOME = /apps/brown/openmpi/2.1.0/gcc-5.2.0 +SNYDER_MPI_HOME = /apps/snyder7/openmpi/2.1.0_gcc-5.2.0 +MPI_HOME = $(SNYDER_MPI_HOME) + +BLAS_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp +LAPACK_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core + +EXTRA_FLAGS = '-Wl,-rpath=$(MKL_HOME)/lib/intel64' , '-Wl,-rpath=$(MPI_HOME)/lib' + +EXTRA_DIRS = '$(MKL_HOME)/lib/intel64' , '$(MPI_HOME)/lib' + +#BLAS_LIB = -lblas +#LAPACK_LIB = -llapack + +# OPTIONAL +# Typically if installed, +# FFTW3_INC can be left empty +# FFTW3_LIB = -lfftw3 +# or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3 +# FTW3_LIB = -lfftw +# May need to link libraries properly as with blas and lapack above +FFTW3_INC = -I$(MLK_HOME)/include/fftw/ +FFTW3_LIB = $(BLAS_LIB) + +# Typically, +# PTHREAD_INC = -DHAVE_UNISTD_H +# PTHREAD_LIB = -lpthread +PTHREAD_INC = -DHAVE_UNISTD_H +PTHREAD_LIB = -lpthread + +# OPTIONAL +# If not installed: +# Fedora: dnf install libsuitsparse-devel +# Typically, if installed: +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd + +# Specify the MPI library +# For example, on Fedora: dnf install openmpi-devel +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi +#MPI_LIB = -lmpi +# or, explicitly link to the library with -L, example below +MPI_LIB = -L$(MPI_HOME)/lib -Wl,-rpath=$(MPI_HOME)/lib -lmpi +MPI_INC = -I$(MPI_HOME)/include +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so + +# Specify custom compilers if needed +CXX = $(MPI_HOME)/bin/mpic++ +CC = $(MPI_HOME)/bin/mpicc + +#CFLAGS += -O3 -fPIC +CFLAGS = -O3 -msse3 -msse2 -msse -fPIC + +# options for Sampler module +OPTFLAGS = -O3 + +OBJDIR = ./build +S4_BINNAME = $(OBJDIR)/S4 +S4_LIBNAME = $(OBJDIR)/libS4.a +S4r_LIBNAME = $(OBJDIR)/libS4r.a + +##################### DO NOT EDIT BELOW THIS LINE ##################### + +#### Set the compilation flags + +CPPFLAGS = -I. -IS4 -IS4/RNP -IS4/kiss_fft + +ifdef BLAS_LIB +CPPFLAGS += -DHAVE_BLAS +endif + +ifdef LAPACK_LIB +CPPFLAGS += -DHAVE_LAPACK +endif + +ifdef FFTW3_LIB +CPPFLAGS += -DHAVE_FFTW3 $(FFTW3_INC) +endif + +ifdef PTHREAD_LIB +CPPFLAGS += -DHAVE_LIBPTHREAD $(PTHREAD_INC) +endif + +ifdef CHOLMOD_LIB +CPPFLAGS += -DHAVE_LIBCHOLMOD $(CHOLMOD_INC) +endif + +ifdef MPI_LIB +CPPFLAGS += -DHAVE_MPI $(MPI_INC) +endif + +LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB) + +#### Compilation targets + +all: $(S4_LIBNAME) + +objdir: + mkdir -p $(OBJDIR) + mkdir -p $(OBJDIR)/S4k + mkdir -p $(OBJDIR)/S4r + mkdir -p $(OBJDIR)/modules + +S4_LIBOBJS = \ + $(OBJDIR)/S4k/S4.o \ + $(OBJDIR)/S4k/rcwa.o \ + $(OBJDIR)/S4k/fmm_common.o \ + $(OBJDIR)/S4k/fmm_FFT.o \ + $(OBJDIR)/S4k/fmm_kottke.o \ + $(OBJDIR)/S4k/fmm_closed.o \ + $(OBJDIR)/S4k/fmm_PolBasisNV.o \ + $(OBJDIR)/S4k/fmm_PolBasisVL.o \ + $(OBJDIR)/S4k/fmm_PolBasisJones.o \ + $(OBJDIR)/S4k/fmm_experimental.o \ + $(OBJDIR)/S4k/fft_iface.o \ + $(OBJDIR)/S4k/pattern.o \ + $(OBJDIR)/S4k/intersection.o \ + $(OBJDIR)/S4k/predicates.o \ + $(OBJDIR)/S4k/numalloc.o \ + $(OBJDIR)/S4k/gsel.o \ + $(OBJDIR)/S4k/sort.o \ + $(OBJDIR)/S4k/kiss_fft.o \ + $(OBJDIR)/S4k/kiss_fftnd.o \ + $(OBJDIR)/S4k/SpectrumSampler.o \ + $(OBJDIR)/S4k/cubature.o \ + $(OBJDIR)/S4k/Interpolator.o \ + $(OBJDIR)/S4k/convert.o + +S4r_LIBOBJS = \ + $(OBJDIR)/S4r/Material.o \ + $(OBJDIR)/S4r/LatticeGridRect.o \ + $(OBJDIR)/S4r/LatticeGridArb.o \ + $(OBJDIR)/S4r/POFF2Mesh.o \ + $(OBJDIR)/S4r/PeriodicMesh.o \ + $(OBJDIR)/S4r/Shape.o \ + $(OBJDIR)/S4r/Simulation.o \ + $(OBJDIR)/S4r/Layer.o \ + $(OBJDIR)/S4r/Pseudoinverse.o \ + $(OBJDIR)/S4r/Eigensystems.o \ + $(OBJDIR)/S4r/IRA.o \ + $(OBJDIR)/S4r/intersection.o \ + $(OBJDIR)/S4r/predicates.o \ + $(OBJDIR)/S4r/periodic_off2.o + +ifndef LAPACK_LIB + S4_LIBOBJS += $(OBJDIR)/S4k/Eigensystems.o +endif + +$(S4_LIBNAME): objdir $(S4_LIBOBJS) + $(AR) crvs $@ $(S4_LIBOBJS) +$(S4r_LIBNAME): objdir $(S4r_LIBOBJS) + $(AR) crvs $@ $(S4r_LIBOBJS) + +$(OBJDIR)/S4k/S4.o: S4/S4.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/rcwa.o: S4/rcwa.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_common.o: S4/fmm/fmm_common.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_FFT.o: S4/fmm/fmm_FFT.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_kottke.o: S4/fmm/fmm_kottke.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_closed.o: S4/fmm/fmm_closed.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisNV.o: S4/fmm/fmm_PolBasisNV.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisVL.o: S4/fmm/fmm_PolBasisVL.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisJones.o: S4/fmm/fmm_PolBasisJones.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_experimental.o: S4/fmm/fmm_experimental.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fft_iface.o: S4/fmm/fft_iface.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/pattern.o: S4/pattern/pattern.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/intersection.o: S4/pattern/intersection.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/predicates.o: S4/pattern/predicates.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/numalloc.o: S4/numalloc.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/gsel.o: S4/gsel.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/sort.o: S4/sort.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fft.o: S4/kiss_fft/kiss_fft.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fftnd.o: S4/kiss_fft/tools/kiss_fftnd.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/SpectrumSampler.o: S4/SpectrumSampler.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/cubature.o: S4/cubature.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Interpolator.o: S4/Interpolator.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/convert.o: S4/convert.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ + + +$(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridRect.o: S4r/LatticeGridRect.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridArb.o: S4r/LatticeGridArb.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/POFF2Mesh.o: S4r/POFF2Mesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/PeriodicMesh.o: S4r/PeriodicMesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Shape.o: S4r/Shape.cpp S4r/Shape.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Simulation.o: S4r/Simulation.cpp S4r/Simulation.hpp S4r/StarProduct.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Layer.o: S4r/Layer.cpp S4r/Layer.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Pseudoinverse.o: S4r/Pseudoinverse.cpp S4r/Pseudoinverse.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Eigensystems.o: S4r/Eigensystems.cpp S4r/Eigensystems.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h + $(CC) -c -O3 $< -o $@ +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4r/predicates.o: S4r/predicates.c + $(CC) -c -O3 $< -o $@ + +#### Python extension + +check: + @echo $(LIBS) + +S4_pyext: objdir $(S4_LIBNAME) + echo "Linking...." + echo $(EXTRA_FLAGS) $(EXTRA_DIRS) + sh gensetupHPC.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" "$(EXTRA_FLAGS)" "$(EXTRA_DIRS)" + python setup.py build + #pip3 install --upgrade ./ + +clean: + rm S4.so + rm -rf $(OBJDIR) diff --git a/PythonTest.py b/PythonTest.py new file mode 100644 index 0000000..a3678a4 --- /dev/null +++ b/PythonTest.py @@ -0,0 +1,126 @@ + +# all lengths normalized in terms of a +# frequency normalized in terms of 2*pi*c/a + +# this test file plots a frequency spectrum and field plot for the python extension + +import S4 +import numpy as np +import matplotlib.pyplot as plt +import time + +t = time.time() + +a = 1e-6; #1nm +c_const = 3e8; +print('Bontempi, Kivshar et al. \"Highly sensitive biosensors based on all-dielectric nanoresonator\"') +p = 0.96 +d = 0.73 +h = 0.22 +period = [p,p] +S = S4.New(Lattice=((period[0],0),(0,period[1])),NumBasis=20) +S.SetMaterial(Name = 'SiO2',Epsilon = (1.45 + 0.0j)**2) +S.SetMaterial(Name = 'Si',Epsilon = (3.487+0.0j)**2) +S.SetMaterial(Name = 'Vacuum',Epsilon = (1 + 0j)**2) + +S.AddLayer(Name = 'AirAbove',Thickness = 0, Material = 'Vacuum') +S.AddLayer(Name = 'AirDisp',Thickness = 1, Material = 'Vacuum') +S.AddLayer(Name = 'Si_disks',Thickness = h,Material = 'Vacuum') + +S.SetRegionCircle( + Layer = 'Si_disks', + Material = 'Si', + Center = (0,0), + Radius = d/2 +) + +S.AddLayer(Name = 'Glass_Below',Thickness = 2, Material = 'SiO2') +#S.AddLayerCopy(Name = 'AirBelow',Thickness = 0, Layer = 'AirAbove') +S.AddLayer(Name = 'AirBelow',Thickness = 0,Material = 'Vacuum') + +S.SetExcitationPlanewave( + IncidenceAngles=( + 0,# polar angle in [0,180) + 0 # azimuthal angle in [0,360) + ), + sAmplitude = 0, + pAmplitude = 1, + Order = 0 +) + +S.SetOptions( + PolarizationDecomposition = True +) + +wavelength_um = 1; +freq = 1 / float(wavelength_um); +S.SetFrequency(freq) #unit 2*pi*c_const / a +forward,backward = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0) # reflected power +forward = S.GetPowerFlux(Layer = 'AirBelow',zOffset = 0) +print('f='+str(1/freq)+', Power_forward= '+str(forward[0].real)+', Power_backward='+str(backward.real)) + +# output POV file +S.OutputStructurePOVRay(Filename = 'output_PythonTest.pov') + + + +# frequency sweep +wavelength_space = np.linspace(1.3, 1.7, 100) + +R = wavelength_space * 0 +T=R +i=0 +for lam in wavelength_space: + f = 1 / float(lam) + S.SetFrequency(f) + (forw,back) = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0) + forw = S.GetPowerFlux(Layer = 'AirBelow', zOffset = 0) + R[i] = np.abs(forw[0]) + #T[i] = np.abs(back) + i +=1 + +plt.plot(wavelength_space, R) +#plt.hold +#plt.plot(wavelength_space, T) +plt.xlabel('Wavelength (um)') +plt.ylabel('T') +plt.title('Figure 3(a)') + +#field map + +wavelength_p_nm = 1400; +wavelength_um = float(wavelength_p_nm)/1000; # [8.0,14.0] +freq = 1 / float(wavelength_um); +S.SetFrequency(freq) #unit 2*pi*c_const / a +x_space = np.linspace(-p/2,p/2,200) +z_space = np.linspace(0.8,1.4,200) + +X,Z = np.meshgrid(x_space,z_space) +Ey = np.zeros((X.shape[0], X.shape[1])) +xc = 0 +zc = 0 + +for x in x_space: + zc=0 + #print('x='+str(x)+'\n') + for z in z_space: + #print('z='+str(z)+'\tzc='+str(zc)+'\n') + E,H = S.GetFields(x,0,z) + Ey[zc,xc] = np.abs(H[1]) ** 2 + np.abs(H[2]) ** 2 + np.abs(H[0]) ** 2 + #print(str(x)+'\t'+str(z)+'\t'+str(abs(E[1]))) + zc += 1 + xc += 1 + +fig = plt.figure() +cm = np.linspace(np.min(Ey),np.max(Ey), 100) +plt.contourf(x_space, z_space, Ey, cm) +plt.plot([-p/2, p/2], [1, 1], color='w', linestyle='-', linewidth=2) +plt.plot([-d/2, d/2], [1+h, 1+h], color='w', linestyle='-', linewidth=2) +plt.plot([-d/2, -d/2], [1, 1+h], color='w', linestyle='-', linewidth=2) +plt.plot([d/2, d/2], [1, 1+h], color='w', linestyle='-', linewidth=2) +plt.title('Figure 3(c) - |H|^2 - lam = ' + str(wavelength_p_nm)) +plt.show(block=True) + +elapsed = time.time()-t; +print('Elapsed time = '+str(elapsed)+'s') +print('If you can read this, the configuration work is finished!') diff --git a/README b/README deleted file mode 100644 index 5969182..0000000 --- a/README +++ /dev/null @@ -1,10 +0,0 @@ -S4: Stanford Stratified Structure Solver (http://fan.group.stanford.edu/S4/) - -A program for computing electromagnetic fields in periodic, layered -structures, developed by Victor Liu (victorliu@alumni.stanford.edu) of the -Fan group in the Stanford Electrical Engineering Department. - -See the S4 manual, in doc/index.html, for a complete -description of the package and its user interface, as well as -installation instructions, the license and copyright, contact -addresses, and other important information. diff --git a/README.md b/README.md new file mode 100644 index 0000000..c0d384a --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +If you are looking for a more user-friendly interface to S4 (wavelength-dependent calculations, material management etc.), check out my Python3 package [RayFlare](https://rayflare.readthedocs.io). + +# Installation instructions (64-bit Ubuntu 16/18/20 or MacOS): + +## Python prerequisites + +``` +pip install numpy wheel setuptools +``` + +## Key steps: + +``` +git clone https://github.com/phoebe-p/S4 +cd S4 +make S4_pyext +``` + +Running the `make boost` command before `make S4_pyext` should automatically download and compile the relevant Boost libraries in the local S4 directory. +HOWEVER, this appears to cause problems on macOS, so if you are installing on macOS you should install the boost libraries +using Homebrew (see below). If you want to use Boost libraries in a different location you will have to edit the Makefile. + +**Note for users of new (late 2020 onwards) Apple machines with M1/Apple silicon/ARM chips: you will need to use Makefile.m1 +to compile successfully, see notes below on how to do this.** + +## Installing relevant libraries etc.: + +**On Ubuntu (with a working version of Python3):** + +``` +sudo apt-get update +sudo apt install make git gcc g++ +sudo apt install libopenblas-dev libfftw3-dev libsuitesparse-dev +``` + +- libopenblas installs OpenBLAS, to satisfy the LAPACK and BLAS requirements +- lib fftw3 install FFTW3, to satisfy the FFTW requirements +- libsuitesparse install libraries to satisdy CHOLMOD & related requirements + +If you want, you can also install the boost libraries using `sudo apt install libboost-all-dev` instead of running `make boost`. + +**On MacOS using Homebrew:** + +``` +brew install fftw suite-sparse openblas lapack boost +``` + +You can get the make and git commands from homebrew, or through Apple Developer Tools. If the packages are installed/symlinked by Homebrew to the default location (/usr/local/include) you should not have to modify the Makefile, and you should be able to use the same Makefile as Ubuntu/Linux (i.e. no need to use Makefile.osx). + +*If you have multiple Python versions, you may need to modify the S4_pyext part of the Makefile:* + +```` +pip3 install --upgrade ./ +```` + +to e.g.: +``` +[path of target python or virtual environment] setup.py install +``` + +You can install S4 into a virtual environment automatically by just activating that environment in your terminal before running `make S4_pyext`. + +See [here](https://rayflare.readthedocs.io/en/latest/Installation/installation.html) for more extensive instructions. + + +**On MacOS with M1/ARM chips:** + +Some of the flags used in the default Makefile cause issues with the new chip architecture. Run the following instead: + +``` +git clone https://github.com/phoebe-p/S4 +cd S4 +make S4_pyext --file="Makefile.m1" +``` + +Installing the libraries with Homebrew should work as expected. + +------------------------------------- + +S4: Stanford Stratified Structure Solver (http://fan.group.stanford.edu/S4/) + +A program for computing electromagnetic fields in periodic, layered +structures, developed by Victor Liu (victorliu@alumni.stanford.edu) of the +Fan group in the Stanford Electrical Engineering Department. + +See the S4 manual, in doc/index.html, for a complete +description of the package and its user interface, as well as +installation instructions, the license and copyright, contact +addresses, and other important information. + +--------------------------------------- + +sajidmc: The MakefileHPC is created to compile the S4 in HPC environment without +root access. Also added an example for python extension testing: + +Check Wiki for details: https://github.com/sajidmc/S4/wiki +Example result: https://raw.githubusercontent.com/sajidmc/S4/master/examples/bontempi_et_al_Nanoscale_2017.png + diff --git a/S4/Interpolator.c b/S4/Interpolator.c index 7504b33..6754345 100644 --- a/S4/Interpolator.c +++ b/S4/Interpolator.c @@ -54,7 +54,7 @@ double* Interpolator_Get(const Interpolator I, double x, int *ny){ if(I->n < 1){ return NULL; } const int ld = 1+I->ny; int i, j; - + *ny = I->ny; switch(I->type){ case Interpolator_CUBIC_SPLINE: @@ -114,4 +114,4 @@ double* Interpolator_Get(const Interpolator I, double x, int *ny){ } } return NULL; -} +} \ No newline at end of file diff --git a/S4/Interpolator.h b/S4/Interpolator.h index 702328d..3ef4e0b 100644 --- a/S4/Interpolator.h +++ b/S4/Interpolator.h @@ -28,7 +28,7 @@ typedef enum Interpolator_type_{ } Interpolator_type; Interpolator Interpolator_New(int n, int ny, double *xy, Interpolator_type type); -void Interpolator_Destroy(Interpolator I); -double* Interpolator_Get(const Interpolator I, double x, int *ny); +void Interpolator_Destroy(Interpolator intp); +double* Interpolator_Get(const Interpolator intp, double x, int *ny); #endif /* _INTERPOLATOR_H_ */ diff --git a/S4/S4.cpp b/S4/S4.cpp index 3fab8af..d0ac9df 100644 --- a/S4/S4.cpp +++ b/S4/S4.cpp @@ -1072,7 +1072,7 @@ int Simulation_ChangeLayerThickness(Simulation *S, Layer *layer, const double *t int ret = 0; if(NULL == S){ ret = -1; } if(NULL == layer){ ret = -2; } - if(thick < 0){ ret = -3; } + if(NULL == thick || *thick < 0){ ret = -3; } if(0 != ret){ S4_TRACE("< Simulation_RemoveLayerPatterns (failed; ret = %d) [omega=%f]\n", ret, S->omega[0]); return ret; diff --git a/S4/fmm/fmm_common.cpp b/S4/fmm/fmm_common.cpp index 858834c..1716fde 100644 --- a/S4/fmm/fmm_common.cpp +++ b/S4/fmm/fmm_common.cpp @@ -52,7 +52,7 @@ double GetLanczosSmoothingFactor(double mp1, int power, double f[2]){ double GetLanczosSmoothingFactor(double mp1, int power, double f[2]){ double fr = hypot(f[0],f[1]); //fprintf(stderr, "%f\t%f\t%f\t%f\t%f\n", f[0], f[1], fr, mp1, 2*mp1+1-fr); - double j = Jinc(fr/(2*mp1+1)); + double j = Jinc(fr/(2*mp1)); return pow(j, power); } diff --git a/S4/main_python.c b/S4/main_python.c index 376f6bb..adac9b8 100644 --- a/S4/main_python.c +++ b/S4/main_python.c @@ -187,7 +187,7 @@ typedef struct{ typedef struct { PyObject_HEAD - Interpolator I; + Interpolator intp; }S4Interpolator; typedef struct @@ -566,7 +566,7 @@ static PyObject *S4Interpolator_new(PyTypeObject *type, PyObject *args, PyObject free(interData.xy); interData.xy = NULL; return NULL; } - self->I = Interpolator_New(interData.n, interData.ny, interData.xy, inter_type); + self->intp = Interpolator_New(interData.n, interData.ny, interData.xy, inter_type); } free(interData.xy); interData.xy = NULL; return (PyObject*)self; @@ -581,7 +581,7 @@ static PyObject *S4Interpolator_Get(S4Interpolator *self, PyObject *args, PyObje PyObject *ret; if(!PyArg_ParseTupleAndKeywords(args, kwds, "d:Get", kwlist, &x)) return NULL; - ys = Interpolator_Get(self->I, x, &ny); + ys = Interpolator_Get(self->intp, x, &ny); if(NULL == ys) Py_RETURN_NONE; ret = PyTuple_New(ny); @@ -645,7 +645,7 @@ static void S4Sim_dealloc(S4Sim* self){ static void S4Interpolator_dealloc(S4Interpolator *self) { - Interpolator_Destroy(self->I); + Interpolator_Destroy(self->intp); Py_TYPE(self)->tp_free((PyObject*) self); } @@ -664,14 +664,16 @@ static PyObject *S4Sim_Clone(S4Sim *self, PyObject *args){ } return (PyObject*)cpy; } + static PyObject *S4Sim_LoadSolution(S4Sim *self, PyObject *args, PyObject *kwds){ //printf"Inside S4Sim_LoadSolution\n"); static char *kwlist[] = { "Filename", NULL }; const char *fname; - if(!PyArg_ParseTupleAndKeywords(args, kwds, "s:LoadSolution", kwlist, &fname)){ return NULL; } + // int err; + + if(!PyArg_ParseTupleAndKeywords(args, kwds, "s:LoadSolution", kwlist, &fname)){ return NULL; } //printf"Parsed args!\n"); - int err; - err = Simulation_LoadSolution(&(self->S), fname); + // err = Simulation_LoadSolution(&(self->S), fname); // if (err != 0){ // HandleSolutionErrorCode("Simulation_LoadSolution", err); // } @@ -682,11 +684,11 @@ static PyObject *S4Sim_SaveSolution(S4Sim *self, PyObject *args, PyObject *kwds) static char *kwlist[] = { "Filename", NULL }; const char *fname; if(!PyArg_ParseTupleAndKeywords(args, kwds, "s:SaveSolution", kwlist, &fname)){ return NULL; } - int err; - err = Simulation_SaveSolution(&(self->S), fname); - if (err != 0){ - HandleSolutionErrorCode("Simulation_SaveSolution", err); - } + // int err; + // err = Simulation_SaveSolution(&(self->S), fname); + // if (err != 0){ + // HandleSolutionErrorCode("Simulation_SaveSolution", err); + // } Py_RETURN_NONE; } @@ -1203,9 +1205,9 @@ static PyObject *S4Sim_GetPropagationConstants(S4Sim *self, PyObject *args, PyOb return NULL; } n = Simulation_GetNumG(&(self->S), &G); - int n2 = 2*n; + int n2 = 2*n; // q vector in S4 contains 2*n complex doubles, meaning we need a memory - // space that can accomodate 4*n doubles + // space that can accomodate 4*n doubles q = (double*)malloc(sizeof(double)*2*n2); ret = Simulation_GetPropagationConstants(&(self->S), layer, q); if(0 != ret){ @@ -1476,12 +1478,12 @@ static PyObject *S4Sim_GetFieldsOnGridNumpy(S4Sim *self, PyObject *args, PyObjec /* PyArray_Descr* desc = PyArray_DescrFromType(NPY_COMPLEX128); */ Earr = PyArray_SimpleNewFromData(3, dims, NPY_COMPLEX128, Efields); /* PyArray_Dims new_dims; */ - /* npy_intp tmp[3] = {1, 0, 2}; */ + /* npy_intp tmp[3] = {1, 0, 2}; */ /* new_dims.ptr = tmp; */ /* new_dims.len = 3; */ /* Earr = PyArray_Transpose(Earr, &new_dims); */ /* npy_intp *strides = PyArray_STRIDES(Earr); */ - /* npy_intp temp; */ + /* npy_intp temp; */ /* temp = strides[0]; */ /* strides[0] = strides[1]; */ /* strides[1] = temp; */ @@ -1509,7 +1511,7 @@ static PyObject *S4Sim_GetFieldsOnGridNumpy(S4Sim *self, PyObject *args, PyObjec /* if (!(EHfields = PyArray_Zeros(4, &dims, desc, 0))) { */ /* goto fail; */ /* } */ - + /* return EHfields; */ /* return Py_BuildValue("(OO)", Earr, Harr); */ return Py_BuildValue("(NN)", Earr, Harr); @@ -2242,4 +2244,4 @@ PyMODINIT_FUNC initS4(void) #ifdef __cplusplus } -#endif +#endif \ No newline at end of file diff --git a/S4/pattern/pattern.c b/S4/pattern/pattern.c index 7e449e0..a6e9e5a 100644 --- a/S4/pattern/pattern.c +++ b/S4/pattern/pattern.c @@ -1337,10 +1337,10 @@ static void sparse_linsolve_d(int n, const double *M, const int *Mcol, int Mnd, ((int*)A->p)[i] = nnz; ((int*)A->i)[nnz] = i; break; - case CHOLMOD_INTLONG: - ((SuiteSparse_long*)A->p)[i] = nnz; - ((int*)A->i)[nnz] = i; - break; +// case CHOLMOD_INTLONG: +// ((SuiteSparse_long*)A->p)[i] = nnz; +// ((int*)A->i)[nnz] = i; +// break; case CHOLMOD_LONG: ((SuiteSparse_long*)A->p)[i] = nnz; ((SuiteSparse_long*)A->i)[nnz] = i; @@ -1356,9 +1356,9 @@ static void sparse_linsolve_d(int n, const double *M, const int *Mcol, int Mnd, case CHOLMOD_INT: ((int*)A->i)[nnz] = Mcol[Mnd*i+j]; break; - case CHOLMOD_INTLONG: - ((int*)A->i)[nnz] = Mcol[Mnd*i+j]; - break; +// case CHOLMOD_INTLONG: +// ((int*)A->i)[nnz] = Mcol[Mnd*i+j]; +// break; case CHOLMOD_LONG: ((SuiteSparse_long*)A->i)[nnz] = Mcol[Mnd*i+j]; break; @@ -1371,9 +1371,9 @@ static void sparse_linsolve_d(int n, const double *M, const int *Mcol, int Mnd, case CHOLMOD_INT: ((int*)A->p)[i] = nnz; break; - case CHOLMOD_INTLONG: - ((SuiteSparse_long*)A->p)[i] = nnz; - break; +// case CHOLMOD_INTLONG: +// ((SuiteSparse_long*)A->p)[i] = nnz; +// break; case CHOLMOD_LONG: ((SuiteSparse_long*)A->p)[i] = nnz; break; diff --git a/S4/rcwa.cpp b/S4/rcwa.cpp index 7b4c263..1c7a8f8 100644 --- a/S4/rcwa.cpp +++ b/S4/rcwa.cpp @@ -1,2124 +1,2130 @@ -/* Copyright (C) 2009-2011, Stanford University - * This file is part of S4 - * Written by Victor Liu (vkl@stanford.edu) - * - * S4 is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * S4 is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "config.h" - -#define _USE_MATH_DEFINES -#include -#include -#include -#include -#include "rcwa.h" -#include "fmm/fft_iface.h" -#include -#ifdef HAVE_BLAS -# include -#endif -#include -#ifdef HAVE_LAPACK -# include -# include -#else -# include -#endif - -#ifdef DUMP_MATRICES -# define RNP_OUTPUT_MATHEMATICA -# define DUMP_STREAM std::cerr -//# define DUMP_STREAM (omega.real() > 1.91637 ? std::cerr : std::cout) -# include -#endif -# include -# include - -#include - -static inline void* rcwa_malloc(size_t size){ - void *ret = malloc_aligned(size, 16); - //memset(ret, 0x0, size); - return ret; -} -static inline void rcwa_free(void *ptr){ - free_aligned(ptr); -} - -#ifdef HAVE_LAPACK - extern "C" void zgelss_( - const integer &m, const integer &n, const integer &nRHS, - std::complex *a, const integer &lda, - std::complex *b, const integer &ldb, - double *s, const double &rcond, integer *rank, - std::complex *work, const integer &lwork, - double *rwork, integer *info - ); -#endif - -static void SingularLinearSolve( - size_t m, size_t n, size_t nRHS, std::complex *a, size_t lda, - std::complex *b, size_t ldb, const double &rcond -){ -#ifdef HAVE_LAPACK - integer info, rank; - std::complex dummy; - zgelss_(m, n, nRHS, a, lda, b, ldb, NULL, rcond, &rank, &dummy, -1, NULL, &info); - integer lwork = (integer)dummy.real(); - std::complex *work = (std::complex*)rcwa_malloc(sizeof(std::complex) * lwork); - double *rwork = (double*)rcwa_malloc(sizeof(double) * 6*m); - - zgelss_(m, n, nRHS, a, lda, b, ldb, rwork, rcond, &rank, work, lwork, rwork+m, &info); - - rcwa_free(rwork); - rcwa_free(work); -#else - RNP::LinearSolve<'N'>(n, nRHS, a, lda, b, ldb); -#endif -} - - -// kp = k-parallel matrix -// kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] -// [ -kx*epsinv*ky kx*epsinv*kx ] -// = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] -// - -static void MakeKPMatrix( - std::complex omega, - size_t n, - const double *kx, - const double *ky, - const std::complex *Epsilon_inv, - int epstype, - const std::complex *kp_existing, - std::complex *kp, - const size_t ldkp // leading dimension of kp, >= 2*n -){ - const size_t n2 = 2*n; - if(NULL != kp_existing){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp); - return; - } - - const std::complex omega2 = omega*omega; - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., kp,ldkp); - - for(size_t i = 0; i < n; ++i){ - kp[(0+i)+(0+i)*ldkp] = omega2 - ky[i]*Epsilon_inv[0]*ky[i]; - kp[(n+i)+(0+i)*ldkp] = kx[i]*Epsilon_inv[0]*ky[i]; - } - for(size_t i = 0; i < n; ++i){ - kp[(0+i)+(n+i)*ldkp] = ky[i]*Epsilon_inv[0]*kx[i]; - kp[(n+i)+(n+i)*ldkp] = omega2 - kx[i]*Epsilon_inv[0]*kx[i]; - } - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+0*ldkp], ldkp); - RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+0*ldkp], ldkp); - RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+n*ldkp], ldkp); - RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+n*ldkp], ldkp); - - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, kx[i], &kp[0+(i+n)*n2], 1); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, ky[i], &kp[i+0*n2], ldkp); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, -kx[i], &kp[i+n+0*n2], ldkp); - } - const std::complex omega2 = omega*omega; - for(size_t i = 0; i < n2; ++i){ - kp[i+i*ldkp] += omega2; - } - } -} - -// kp = k-parallel matrix -// kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] -// [ -kx*epsinv*ky kx*epsinv*kx ] -// = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] -// [ kx ] - -void MultKPMatrix( - const char *trans, - const std::complex omega, - const size_t n, - const double *kx, - const double *ky, - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - const int Epsilon2_type, - const std::complex *kp, // kp if exists. Can be NULL - const size_t ncols, - const std::complex *a, - const size_t lda, - std::complex *y, // same size as a, y <- kp*a - const size_t ldy -){ - const size_t n2 = 2*n; - - if(NULL != kp){ - if('N' == trans[0]){ - if(ncols > 1){ - RNP::TBLAS::MultMM<'N','N'>(n2,ncols,n2, std::complex(1.),kp,n2, a,lda, std::complex(0.),y,ldy); - }else{ - RNP::TBLAS::MultMV<'N'>(n2,n2, std::complex(1.),kp,n2, a,1, std::complex(0.),y,1); - } - }else{ - if(ncols > 1){ - RNP::TBLAS::MultMM<'C','N'>(n2,ncols,n2, std::complex(1.),kp,n2, a,lda, std::complex(0.),y,ldy); - }else{ - RNP::TBLAS::MultMV<'C'>(n2,n2, std::complex(1.),kp,n2, a,1, std::complex(0.),y,1); - } - } - return; - } - - const std::complex omega2 = omega*omega; - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == Epsilon2_type || EPSILON2_TYPE_BLKDIAG2_SCALAR == Epsilon2_type){ - const std::complex epsinv( - 'N' == trans[0] ? Epsilon_inv[0] : std::conj(Epsilon_inv[0]) - ); - for(size_t j = 0; j < ncols; ++j){ - for(size_t i = 0; i < n; ++i){ - std::complex c = a[(0+i)+j*lda]; - std::complex d = a[(n+i)+j*lda]; - y[(0+i)+j*ldy] = (omega2 - ky[i]*epsinv*ky[i]) * c + ( ky[i]*epsinv*kx[i]) * d; - y[(n+i)+j*ldy] = ( kx[i]*epsinv*ky[i]) * c + (omega2 - kx[i]*epsinv*kx[i]) * d; - } - } - }else{ - for(size_t j = 0; j < ncols; ++j){ - for(size_t i = 0; i < n; ++i){ - y[i+j*ldy] = ky[i]*a[(0+i)+j*lda] - kx[i]*a[(n+i)+j*lda]; - } - } - if('N' == trans[0]){ - if(ncols > 1){ - RNP::TBLAS::MultMM<'N','N'>(n,ncols,n, 1.,Epsilon_inv,n, &y[0+0*ldy],ldy, 0.,&y[n+0*ldy],ldy); - }else{ - RNP::TBLAS::MultMV<'N'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1); - } - }else{ - if(ncols > 1){ - RNP::TBLAS::MultMM<'C','N'>(n,ncols,n, 1.,Epsilon_inv,n, &y[0+0*ldy],ldy, 0.,&y[n+0*ldy],ldy); - }else{ - RNP::TBLAS::MultMV<'C'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1); - } - } - - for(size_t j = 0; j < ncols; ++j){ - for(size_t i = 0; i < n; ++i){ - y[(0+i)+j*ldy] = omega2 * a[(0+i)+j*lda] - ky[i]*y[n+i+j*ldy]; - y[(n+i)+j*ldy] = omega2 * a[(n+i)+j*lda] + kx[i]*y[n+i+j*ldy]; - } - } - } -} - -void SolveLayerEigensystem_uniform( - std::complex omega, - size_t n, - const double *kx, - const double *ky, - std::complex eps, - std::complex *q, // length 2*glist.n - std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - std::complex *phi // size (2*glist.n)^2 -){ - const size_t n2 = 2*n; - - const std::complex eta = 1./ eps; - const std::complex omega2 = omega*omega; - - // Fill kp - if(NULL != kp){ - MakeKPMatrix(omega, n, kx, ky, &eta, EPSILON2_TYPE_BLKDIAG1_SCALAR, NULL, kp, n2); - -#ifdef DUMP_MATRICES - DUMP_STREAM << "kp:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,kp,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,kp,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - } - - for(size_t i = 0; i < n; ++i){ - q[i] = eps*omega2 - kx[i]*kx[i] - ky[i]*ky[i]; - - // Set the \hat{q} vector (diagonal matrix) while we're at it - if(0 == omega.imag()){ // Not bandsolving - q[i] = std::sqrt(q[i]); - if(q[i].imag() < 0){ - q[i] = -q[i]; - } - }else{ // performing some kind of bandsolving, need to choose the appropriate branch - if(q[i].real() < 0){ - // branch cut should be just below positive real axis - q[i] = std::complex(0,1) * std::sqrt(-q[i]); - }else{ - // branch cut should be just below negative real axis - // This is the default behavior for sqrt(std::complex) - q[i] = std::sqrt(q[i]); - } - } - - /* - if(std::complex(0) == q[i]){ - q[i] = DBL_EPSILON*omega; - } - */ - - q[i+n] = q[i]; - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "q:" << std::endl; - RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl; -#endif - if(NULL != phi){ - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., phi,n2); -#ifdef DUMP_MATRICES - DUMP_STREAM << "phi:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - } -} - -void SolveLayerEigensystem( - std::complex omega, - size_t n, - const double *kx, - const double *ky, - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - const std::complex *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix) - int epstype, - std::complex *q, // length 2*glist.n - std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - std::complex *phi, // size (2*glist.n)^2 - std::complex *work_, - double *rwork_, - size_t lwork -){ - const size_t n2 = 2*n; - - if((size_t)-1 == lwork){ - RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, NULL, lwork); - work_[0] += n2*n2; - return; - }else if(0 == lwork){ - lwork = n2*n2+2*n2; - } - - std::complex *work = work_; - size_t eigenlwork; - if(NULL == work_ || lwork < n2*n2+2*n2){ - lwork = (size_t)-1; - RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork); - eigenlwork = (size_t)q[0].real(); - work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(eigenlwork + n2*n2)); - }else{ - eigenlwork = lwork - n2*n2; - } - std::complex *op = work; - std::complex *eigenwork = op + n2*n2; - - double *rwork = rwork_; - if(NULL == rwork_){ - rwork = (double*)rcwa_malloc(sizeof(double)*2*n2); - } - -#ifdef DUMP_MATRICES - DUMP_STREAM << "kx:" << std::endl; - RNP::IO::PrintVector(n,kx,1, DUMP_STREAM) << std::endl; - DUMP_STREAM << "ky:" << std::endl; - RNP::IO::PrintVector(n,ky,1, DUMP_STREAM) << std::endl << std::endl; -#endif - -#ifdef DUMP_MATRICES - DUMP_STREAM << "Epsilon_inv:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n,n,Epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n,Epsilon_inv,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - // Fill kp - std::complex *kp_use = (NULL != kp ? kp : phi); - - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2); -#ifdef DUMP_MATRICES - DUMP_STREAM << "kp:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,kp_use,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - -#ifdef DUMP_MATRICES - DUMP_STREAM << "Epsilon2:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,Epsilon2,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - // Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky] - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2); - - for(size_t i = 0; i < n; ++i){ - op[i+i*n2] -= kx[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+n+i*n2] -= ky[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+(i+n)*n2] -= kx[i]*ky[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+n+(i+n)*n2] -= ky[i]*ky[i]; - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "op:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; - //RNP::IO::PrintVector(n2,&op[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - -#ifdef DUMP_MATRICES -# ifdef DUMP_MATRICES_LARGE - std::complex *op_save = (std::complex *)rcwa_malloc(sizeof(std::complex) * 2*n2*n2); - std::complex *op_temp = op_save + n2*n2; - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2); -# endif -#endif - int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork); - if(0 != info){ - std::cerr << "Layer eigensystem returned info = " << info << std::endl; - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "eigen info = " << info << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::TBLAS::Fill(n2*n2, 0., op_temp,1); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,op_save,n2, phi,n2, 0.,op_temp,n2); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, -q[i], &phi[0+i*n2],1, &op_temp[0+i*n2],1); - } - DUMP_STREAM << "eigensystem residual:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl; - rcwa_free(op_save); -# endif -#endif - - for(size_t i = 0; i < n2; ++i){ - // Set the \hat{q} vector (diagonal matrix) while we're at it - if(0 == omega.imag()){ // Not bandsolving - q[i] = std::sqrt(q[i]); - if(q[i].imag() < 0){ - q[i] = -q[i]; - } - }else{ // performing some kind of bandsolving, need to choose the appropriate branch - if(q[i].real() < 0){ - // branch cut should be just below positive real axis - q[i] = std::complex(0,1) * std::sqrt(-q[i]); - }else{ - // branch cut should be just below negative real axis - // This is the default behavior for sqrt(std::complex) - q[i] = std::sqrt(q[i]); - } - } - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "q:" << std::endl; - RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl; - DUMP_STREAM << "phi:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - if(NULL == work_ || lwork < n2*n2+2*n2){ - rcwa_free(work); - } - if(NULL == rwork_){ - rcwa_free(rwork); - } -} - - -void InitSMatrix( - size_t n, - std::complex *S // size (4*n)^2 -){ - const size_t n4 = 4*n; - RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); -} -void GetSMatrix( - size_t nlayers, - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const double *thickness, // list of thicknesses - const std::complex **q, // list of q vectors - const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int *epstype, - const std::complex **kp, - const std::complex **phi, - std::complex *S, // size (4*n)^2 - std::complex *work_, - size_t *iwork, - size_t lwork -){ - if(0 == nlayers){ return; } - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - - if((size_t)-1 == lwork){ - work_[0] = n4*(n4+1); - return; - } - std::complex *work = work_; - if(NULL == work_ || lwork < n4*(n4+1)){ - work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(n4*(n4+1))); - } - size_t *pivots = iwork; - if(NULL == iwork){ - pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4); - } - - RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); - - std::complex *t1 = work; - std::complex *t2 = t1 + n2*n2; - std::complex *in1 = t2 + n2*n2; - std::complex *in2 = in1 + n2*n2; - std::complex *d1 = in2 + n2*n2; - std::complex *d2 = d1 + n2; - - for(size_t l = 0; l < nlayers-1; ++l){ - size_t lp1 = l+1; - if(lp1 >= nlayers){ lp1 = l; } - - // Make the interface matrices - if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){ - // This is a trivial interface, set to identity - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2); - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2); - }else{ - // The interface matrix is the inverse of the mode-to-field matrix of layer l - // times the mode-to-field matrix of layer l+1 (lp1). - // The mode-to-field matrix is of the form - // [ B -B ] where A = phi - // [ A A ] where B = kp*phi*inv(diag(q)) = G*A/q - // So we want - // 0.5 * [ iBl iAl ] [ Blp1 -Blp1 ] - // [ -iBl iAl ] [ Alp1 Alp1 ] - // Multiplying out gives - // 0.5 * [ P+Q P-Q ] // where P = iAl*Alp1, and i in front means inverse - // [ P-Q P+Q ] // where Q = iBl*Blp1 - // Making P is easy, since A is a single matrix. - // Making Q is as follows: - // Q = iBl*Blp1 - // = ql*iAl*iGl * Gl*Alp1*iqlp1 - // We will only store I11 and I21 - /* - { - std::complex *Ml = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - std::complex *Mlp1 = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, 1./q[l][i], &Ml[0+(i+n2)*n4], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[l],n2, &Ml[0+n2*n4],n4, 0.,&Ml[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Ml[0+0*n4],n4, &Ml[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1); - } - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, 1./q[lp1][i], &Mlp1[0+(i+n2)*n4], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[lp1],n2, &Mlp1[0+n2*n4],n4, 0.,&Mlp1[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Mlp1[0+0*n4],n4, &Mlp1[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1); - } - - //RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots); -#ifdef DUMP_MATRICES - DUMP_STREAM << "M(" << l << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,Ml,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,Ml,1, DUMP_STREAM) << std::endl << std::endl; -# endif - DUMP_STREAM << "M(" << lp1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,Mlp1,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - rcwa_free(Mlp1); - rcwa_free(Ml); - } - */ - // Make Bl in t1 - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2); - { - if(NULL == phi[l]){ - if(NULL == kp[l]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2); - } - }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2); - } - //for(size_t i = 0; i < n2; ++i){ - // RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1); - //} - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << l << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,t1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - // Make Blp1 in in1 - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2); - { - if(NULL == phi[lp1]){ - if(NULL == kp[lp1]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2); - } - }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2); - } - //for(size_t i = 0; i < n2; ++i){ - // RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); - //} - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - int solve_info; - // Make Q in in1 - //RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots); - SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON); - // Now perform the diagonal scalings - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2); - } - { - double maxel = 0; - for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[lp1][i]); - if(el > maxel){ maxel = el; } - } - for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[lp1][i]); - if(el < DBL_EPSILON * maxel){ - RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1); - }else{ - RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); - } - } - } - - // Make P in in2 - if(NULL == phi[lp1]){ - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2); - } - if(NULL != phi[l]){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2); - RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots); - } - - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q - RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q - RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q - RNP::TBLAS::Scale(n2*n2, 0.5, in1,1); - RNP::TBLAS::Scale(n2*n2, 0.5, in2,1); - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; -# endif - DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - for(size_t i = 0; i < n2; ++i){ - d1[i] = std::exp(q[l ][i] * std::complex(0,thickness[l ])); - d2[i] = std::exp(q[lp1][i] * std::complex(0,thickness[lp1])); - } - - // Make S11 - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21 - for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21) - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2); - int solve_info; - RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1} - - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2); - for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4); - // S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1} - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22 - for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12 - for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1} - RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4); - // S12 done, and t2 can be reused - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21 - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4); - // S21 done, need to keep t1 = S22 I21 - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22 - for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1} - RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1); - } - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4); - -#ifdef DUMP_MATRICES - DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - } - if(NULL == work_ || lwork < n4*(n4+1)){ - rcwa_free(work); - } - if(NULL == iwork){ - rcwa_free(pivots); - } -} - -int SolveInterior( - size_t nlayers, - size_t which_layer, - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const double *thickness, // list of thicknesses - const std::complex **q, // list of q vectors - const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int *epstype, - const std::complex **kp, - const std::complex **phi, - std::complex *a0, // length 2*n - std::complex *bN, // length 2*n - std::complex *ab, // length 4*n - std::complex *work_, // length lwork - size_t *iwork, // length n2 - size_t lwork // set to -1 for query into work[0], at least 2*(4*n)^2 + 2*(2*n) + 4*n*(4*n+1) -){ - if(0 == nlayers){ return-1; } - if(which_layer >= nlayers){ return -2; } - - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - const size_t lwork_GetSMatrix = 4*n*(4*n+1); - const size_t lwork_needed = 2*n4*n4 + 2*n2 + lwork_GetSMatrix; - - if((size_t)-1 == lwork){ - work_[0] = lwork_needed; - return 0; - } - std::complex *work = work_; - if(NULL == work_ || lwork < lwork_needed){ - work = (std::complex*)rcwa_malloc(sizeof(std::complex)*lwork_needed); - } - size_t *pivots = iwork; - if(NULL == iwork){ - pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n2); - } - - std::complex *S0l = work; - std::complex *SlN = S0l + n4*n4; - std::complex *S11a0 = SlN + n4*n4; - std::complex *S22bN = S11a0 + n2; - std::complex *work_GetSMatrix = S22bN + n2; - std::complex *al = ab; - std::complex *bl = al+n2; - std::complex *temp = S0l; - size_t ldtemp = n4; - - int info; - - GetSMatrix(which_layer+1, n, kx, ky, omega, - thickness, q, Epsilon_inv, epstype, kp, phi, - S0l, work_GetSMatrix, pivots, lwork_GetSMatrix); - GetSMatrix(nlayers-which_layer, n, kx, ky, omega, - thickness+which_layer, q+which_layer, Epsilon_inv+which_layer, epstype+which_layer, kp+which_layer, phi+which_layer, - SlN, work_GetSMatrix, pivots, lwork_GetSMatrix); - -#ifdef DUMP_MATRICES - DUMP_STREAM << "S0l(0," << which_layer << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,S0l,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,S0l,1, DUMP_STREAM) << std::endl << std::endl; -# endif - DUMP_STREAM << "SlN(" << which_layer << "," << nlayers-1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,SlN,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,SlN,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - // both solutions only depend on the products S11(0,l)*a0 and S22(l,N)*bN - if(NULL != a0){ - RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &S0l[0+0*n4], n4, - a0, 1, - std::complex(0.0), S11a0, 1); - }else{ - RNP::TBLAS::Fill(n2, 0., S11a0, 1); - } - if(NULL != bN){ - RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &SlN[n2+n2*n4], n4, - bN, 1, - std::complex(0.0), S22bN, 1); - }else{ - RNP::TBLAS::Fill(n2, 0., S22bN, 1); - } - - // We overwrite the upper left submatrix S11(0,l) since it's not needed anymore - // temp is set to S0l for this reason. - - // Compute -S_12(0,l)S_21(l,N) - RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex(-1.0), &S0l[0+n2*n4], n4, - &SlN[n2+0*n4], n4, - std::complex(0.0), temp, ldtemp); - for(size_t i = 0; i < n2; ++i){ - temp[i+i*ldtemp] += 1.; - } // temp = (1 - S_12(0,l)S_21(l,N)) - - RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &S0l[0+n2*n4], n4, - S22bN, 1, - std::complex(0.0), al, 1); // al = S_12(0,l)S_22(l,N)bN - RNP::TBLAS::Axpy(n2, std::complex(1.0), S11a0, 1, al, 1); // al = S_11(0,l)*a0 + S_12(0,l)S_22(l,N)bN - - RNP::LinearSolve<'N'>(n2, 1, temp, ldtemp, al, n2, &info, pivots); - // al done - - // Make the other matrix - // Compute S_21(l,N)S_12(0,l) - RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex(-1.0), &SlN[n2+0*n4], n4, - &S0l[0+n2*n4], n4, - std::complex(0.0), temp, ldtemp); - for(size_t i = 0; i < n2; ++i){ - temp[i+i*ldtemp] += 1.; - } // temp = (1 - S_21(l,N)S_12(0,l)) - - RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &SlN[n2+0*n4], n4, - S11a0, 1, - std::complex(0.0), bl, 1); // bl = S_21(l,N)S_11(0,l)a0 - RNP::TBLAS::Axpy(n2, std::complex(1.0), S22bN, 1, bl, 1); // bl = S_21(l,N)S_11(0,l)a0 + S_22(l,N)bN - RNP::LinearSolve<'N'>(n2, 1, temp, ldtemp, bl, n2, &info, pivots); - -#ifdef DUMP_MATRICES - DUMP_STREAM << "al:" << std::endl; - RNP::IO::PrintVector(n2,al,1, DUMP_STREAM) << std::endl << std::endl; - DUMP_STREAM << "bl:" << std::endl; - RNP::IO::PrintVector(n2,bl,1, DUMP_STREAM) << std::endl << std::endl; -#endif - - if(NULL == work_ || lwork < lwork_needed){ - rcwa_free(work); - } - if(NULL == iwork){ - rcwa_free(pivots); - } - return 0; -} - - - -void TranslateAmplitudes( - size_t n, // glist.n - const std::complex *q, // length 2*glist.n - double thickness, - double dz, - std::complex *ab -){ - const size_t n2 = 2*n; - for(size_t i = 0; i < n2; ++i){ - std::complex iq = std::complex(0,1)*q[i]; - // When extrapolating, the exponentials can be huge, so we need to check - // against overflow. This doesn't solve the problem completely, but it helps. - if(0. != ab[i]){ - ab[i] *= std::exp(iq*dz); - } - if(0. != ab[i+n2]){ - ab[i+n2] *= std::exp(iq*(thickness-dz)); - } - } -} - -void GetZPoyntingFlux( - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int epstype, - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *ab, // length 4*glist.n - std::complex *forward, - std::complex *backward, - std::complex *work -){ - const size_t n2 = 2*n; - - std::complex *a2 = work; - if(NULL == work){ - a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); - } - std::complex *b2 = a2 + n2; - std::complex *a3 = b2 + n2; - std::complex *b3 = a3 + n2; - - memcpy(a2, ab, sizeof(std::complex) * 2*n2); - for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } - for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } - if(NULL == phi){ - RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); - }else{ - RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); - } - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2); - if(NULL == phi){ - RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); - }else{ - RNP::TBLAS::MultMM<'C','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); - } - - // At this point a3[n2] is alpha and b3[n2] is beta, - // ab[n2] is a and (ab+n2)[n2] is b - - *forward = RNP::TBLAS::ConjugateDot(n2, ab ,1, a3,1).real(); - *backward = -RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, b3,1).real(); - std::complex diff = 0.5*(RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, a3,1) - RNP::TBLAS::ConjugateDot(n2, b3,1, ab,1)); - *forward += diff; - *backward += std::conj(diff); - - if(NULL == work){ - rcwa_free(a2); - } -} - -void GetZPoyntingFluxComponents( - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int epstype, - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *ab, // length 4*glist.n - std::complex *forward, - std::complex *backward, - std::complex *work -){ - const size_t n2 = 2*n; - - std::complex *a2 = work; - if(NULL == work){ - a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); - } - std::complex *b2 = a2 + n2; - std::complex *a3 = b2 + n2; - std::complex *b3 = a3 + n2; - - memcpy(a2, ab, sizeof(std::complex) * 2*n2); - for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } - for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } - if(NULL == phi){ - RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); - }else{ - RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); - } - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2); - // At this point, a2[n2] contains alpha_e, b2[n2] contains beta_e - - if(NULL == phi){ - RNP::TBLAS::CopyMatrix<'A'>(n2,2, ab,n2, a3,n2); - }else{ - RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, ab,n2, std::complex(0.), a3,n2); - } - // At this point, a3[n2] contains alpha_h, b3[n2] contains beta_h - - for(size_t i = 0; i < n; ++i){ - forward[i] = 0; - backward[i] = 0; - for(size_t j = 0; j < n2; j+=n){ - const size_t k = i+j; - forward[i] += std::real(std::conj(a2[k])*a3[k]); - backward[i] -= std::real(std::conj(b2[k])*b3[k]); - std::complex diff = 0.5*(std::conj(b3[k])*a2[k] - std::conj(b2[k])*a3[k]); - forward[i] += diff; - backward[i] += std::conj(diff); - } - } - - if(NULL == work){ - rcwa_free(a2); - } -} - -static void GetInPlaneFieldVector( - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int epstype, - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *ab, // length 4*glist.n - std::complex *eh // length 8*2*glist.n -){ - // ab are the mode amplitudes times the f(z) operators in the column vector - // on the bottom row of eqn 35. By the time we enter this function, - // TranslateAmplitudes has already been called which does the multiplcation - // by the f(z), f(d-z) operators. Those operators are the things that take - // the mode amplitudes at each interface and use the - // eigenvalues/propagation constants q to find the values of the mode - // amplitudes at a particular z coordinate. The array has length 4n. The - // forward in the first 2n locations, and the backward mode amplitudes are - // stored in the remaining 2n locations. The forward amplitudes are - // relative to the front of the layer, while the backward amplitudes are - // relative to the back of the layer. - // So ab = [f(z)a, f(d-z)b] - - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - - // This is the column vector on the bottom line of eqn 35 by q^-1 and - // storing it in eh[8N:12N]. This will be used for computing the top two - // blocks of the matrix in eqn 35 - // TODO: I don't understand why the omega is in here - for(size_t i = 0; i < n2; ++i){ - eh[4*n2+i] = ab[i]/(omega*q[i]); - eh[5*n2+i] = -ab[i+n2]/(omega*q[i]); - } - // This copies the array ab, which is of length 4n, to the 12N-th element - // of eh. eh is only 16N elements long, so this takes up the last section - // of space in the array. At this point eh[8N:16N] = [f(z)a/q -f(d-z)b/q - // f(z)a f(d-z)b]. The minus sign in the 2nd section is needed later, - // because the top right block of the matrix in eqn 35 has a minus sign. - RNP::TBLAS::Copy(n4, ab,1, &eh[6*n2],1); - if(NULL == phi){ - // This operation copies eh[8N:16N] (8N elements) to eh[0:8N] (also 8N - // elements) - RNP::TBLAS::CopyMatrix<'A'>(n2,4, &eh[4*n2],n2, &eh[0],n2); - }else{ - // This computes following matrix op - // C := alpha*A*B + beta*C - // m = 2N, n = 4, k = 2N - // A = phi (m x k) - // B = eh[8N:] (k x n) - // C = eh[0] (m x n) - // alpha = 1, beta = 0 - // This computes the first step of the operation in eqn 35 and stores - // it at the beginning of the eh array eh[0:8N] - RNP::TBLAS::MultMM<'N','N'>(n2,4,n2, std::complex(1.),phi,n2, &eh[4*n2],n2, std::complex(0.),&eh[0],n2); - } - // At this point, the 4 columns of &eh[0] (length n2) are - // [ phi.inv(q).a -phi.inv(q).b phi.a phi.b ] - // kp = k-parallel matrix - // kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] - // [ -kx*epsinv*ky kx*epsinv*kx ] - // = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] - // [ kx ] - // So kp is the thing inside the parentheses in the upper row of the matrix - // in equation 35 - // This function call peforms the matrix multiplication in the top row of - // eqn 35. There are 2 "columns" of 2N elements, so the first 4N elements - // of eh (see top of comments for contents of eh) get multiplied by the kp matrix - // and stored in eh[8N:12N] - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, &eh[0],n2, &eh[4*n2],n2); - // Computes eh[4N:6N] + eh[6N:8N] and stores in in eh[6N:8N]. This is - // adding the two bottom blocks of eqn 35 together after multiplication to - // get the hx and hy components - RNP::TBLAS::Axpy(n2, std::complex(1.),&eh[2*n2],1, &eh[3*n2],1); - // Computes eh[10N:12N] + eh[8N:10N] and stores it in eh[8N:10N]. This is - // just adding the two top blocks in eqn 35 together after all the - // M - // multiplication to get the -ey and ex components - RNP::TBLAS::Axpy(n2, std::complex(1.),&eh[5*n2],1, &eh[4*n2],1); - /* - // hx = eh[6N:7N] - const std::complex *hx = &eh[3*n2+0]; - // hy = eh[7N:8N] - const std::complex *hy = &eh[3*n2+n]; - // -ey = eh[8N:9N] - const std::complex *ney = &eh[4*n2+0]; - // ex = eh[9N:10N] - const std::complex *ex = &eh[4*n2+n]; - */ -} - -void GetFieldAtPoint( - size_t n, // glist.n - const double *kx, - const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL - int epstype, - const std::complex *ab, // length 4*glist.n - const double r[2], // coordinates within layer - std::complex efield[3], - std::complex hfield[3], - std::complex *work // 8*n2 -){ - // This is just a way of instantiating 1 with a complex number type, and 0 - // with a complex number type, which I think is necessary for type - // compatibility and whatnot with all the lapack functions. - const std::complex z_zero(0.); - const std::complex z_one(1.); - const size_t n2 = 2*n; - - // This is the memory workspace for the fourier coeffs of the fields. Its - // of length 16*N - std::complex *eh = work; - if(NULL == work){ - eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - } - - // This is the function I need to modify. Right now it puts ex, ey, hx, hy - // inside eh (i.e the fourier coefficients of the transverse field - // components). I need to get at the fourier coefficients of d_normal and - // e_tangential and return them, then use the code below to get the real - // space reconstructions - GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); - // So eh contains the fourier coefficients of hx, hy, -ey and ex. This code - // below sets up pointers to the various continous chunks of the array that - // contain the relevant fourier components. Notice the first 6N (3*2*N) - // elements of the array are skipped. - const std::complex *hx = &eh[3*n2+0]; - const std::complex *hy = &eh[3*n2+n]; - const std::complex *ney = &eh[4*n2+0]; - const std::complex *ex = &eh[4*n2+n]; - - std::complex fE[3], fH[3]; - fE[0] = 0; - fE[1] = 0; - fE[2] = 0; - fH[0] = 0; - fH[1] = 0; - fH[2] = 0; - - if(NULL != efield && NULL != epsilon_inv){ - // This is setting up the left side of equation 11 of the paper and - // storing in the first N elements of the eh array - for(size_t i = 0; i < n; ++i){ - eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); - } - // This multiplies epsilon_inv by eh[0:N] and copies it into the - // memory space of eh[N:2N]. Looking at equation 11 of the paper, you - // can see after setting up the left size we still need to multiply it - // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some - // reason that is done in the next loop. I think we could do it by - // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega - // and z_one by 1/omega in the MultMV call. Perhaps there is a reason - // the author didn't do it this way - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); - RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); - }else{ - // eh[n] = (1+0i) * epsilon_inv*eh + 0 * eh[n] - // The 0 * eh[n] part seems silly but it allows us to copy the - // result of the first term into eh[n] - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); - } - } - - // This loop below is basically equation 5 from the paper!! - for(size_t i = 0; i < n; ++i){ - // This is (\vec{k} + \vec{G}) \cdot \vec{x} at each G vector - const double theta = (kx[i]*r[0] + ky[i]*r[1]); - // This is using Euler's rule to compute e^{i (\vec{k} + \vec{G}) \cdot \vec{x}) - const std::complex phase(cos(theta),sin(theta)); - // And this is just where each individual term is added to the total - fH[0] += hx[i]*phase; - fH[1] += hy[i]*phase; - fE[0] += ex[i]*phase; - fE[1] -= ney[i]*phase; - // These is eqn 14 from the paper. z component of h is expressed in - // terms of transverse components of e - fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega); - // This is the final bit of equation 11 from the paper. We need to - // divide by omega to get the fourier coefficients of e_z, then - // multiply by phase and sum to get real space resconstruction - fE[2] += eh[n+i] * (phase / omega); - } - - if(NULL != efield && NULL != epsilon_inv){ - efield[0] = fE[0]; - efield[1] = fE[1]; - efield[2] = fE[2]; - } - if(NULL != hfield){ - hfield[0] = fH[0]; - hfield[1] = fH[1]; - hfield[2] = fH[2]; - } - - if(NULL == work){ - rcwa_free(eh); - } -} - - -void GetFieldAtPointImproved( - size_t n, // glist.n - const double *kx, - const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL. This is \hat{\eta} in paper notation - const std::complex *P, // Projection operator onto tangential. 2nx2n matrix, size 4n^2 - const std::complex *W, // Weismann operator. 2nx2n matrix, size 4n^2 - const std::complex *Epsilon2, // curly Epsilon2 matrix (see eqn 51) - const std::complex epsilon, // The scalar, real space value of epsilon - int epstype, - const std::complex *ab, // length 4*glist.n - const double r[2], // coordinates within layer - std::complex efield[3], - std::complex hfield[3], - std::complex *work // 8*n2 -){ - // This is just a way of instantiating 1 with a complex number type, and 0 - // with a complex number type, which I think is necessary for type - // compatibility and whatnot with all the lapack functions. - const std::complex z_zero(0.); - const std::complex z_one(1.); - const size_t n2 = 2*n; - -#ifdef DUMP_MATRICES - if(NULL != P){ - DUMP_STREAM << "P:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,P,n2, DUMP_STREAM) << std::endl << std::endl; - DUMP_STREAM << "W:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,W,n2, DUMP_STREAM) << std::endl << std::endl; - } -#endif -#ifdef DUMP_MATRICES - if(NULL != W){ - DUMP_STREAM << "Epsilon2:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl; - } -#endif - - // This is the memory workspace for the fourier coeffs of the fields. Its - // of length 16*N - std::complex *eh = work; - if(NULL == work){ - eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - } - - // The function below puts ex, ey, hx, hy inside eh (i.e the fourier - // coefficients of the transverse field components). I need to get at the - // fourier coefficients of d_normal and e_tangential, then use the code - // below this function call to get the real space reconstructions - // hx = eh[6N:7N] - // hy = eh[7N:8N] - // -ey = eh[8N:9N] - // ex = eh[9N:10N] - GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); - // So eh contains the fourier coefficients of hx, hy, -ey and ex. This code - // below sets up pointers to the various continous chunks of the array that - // contain the relevant fourier components. Notice the first 6N (3*2*N) - // elements of the array are skipped. - // START HERE - // 1) I need to get access to Epsilon2, and use it to extract the fourier - // coefficients of dx, dy by multiplying it with [-ey ex]. In other words - // [-dy dx] = Epsilon2 [-ey ex]. - // 2) I then use P to project onto normal basis, and I - P to project onto - // tangential basis. - const std::complex *hx = &eh[3*n2+0]; - const std::complex *hy = &eh[3*n2+n]; - const std::complex *ney = &eh[4*n2+0]; - const std::complex *ex = &eh[4*n2+n]; - - // Arrays for the real space reconstructions of the 3 field components - std::complex fE[3], fH[3]; - fE[0] = 0; - fE[1] = 0; - fE[2] = 0; - fH[0] = 0; - fH[1] = 0; - fH[2] = 0; - - // So the equations look like this: - // [-d_{n,y} d_{n,x}] = W [-ey ex] - // \hat{T} = I - P - // [-e_{t,y} e_{t,x}] = T [-ey, ex] - // Remeber P is NULL for unpatterned layers, and we don't need to use this - // procedure for unpatterned layers anyway! - std::complex *dn_and_et = (std::complex*)rcwa_malloc(sizeof(std::complex) * 2*n2); - // These are just some convenience pointers to the various sections of - // the dn and et array - const std::complex *ndny = &dn_and_et[0]; - const std::complex *dnx = &dn_and_et[n]; - const std::complex *nety = &dn_and_et[n2]; - const std::complex *etx = &dn_and_et[n2+n]; - if(NULL != W){ - // We now have all we need to get the Fourier coefficients of the - // normal component of d and the tangential components of e. I - // allocated a new array for the dnormal components and etangential - // components. dnormal will be in the first 2N elements, etangential in - // the last 2N elements - // First dnormal - // This is my approach - /* RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,N,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); */ - // This is Weismann's approach verbatim - RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,W,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); -#ifdef DUMP_MATRICES - DUMP_STREAM << "Dnormal:" << std::endl; - RNP::IO::PrintVector(n2,dn_and_et,1,DUMP_STREAM) << std::endl << std::endl; -#endif - // Then lets construct \hat{T}. This is the operator that projects onto the - // direction tangential to a material interface, and is just I - P - std::complex *T; - // std::complex diag = std::complex(1.0*n); - std::complex diag = std::complex(1.0); - T = (std::complex*)rcwa_malloc(sizeof(std::complex) * n2*n2); - // Set T to the identity first - RNP::TBLAS::SetMatrix<'A'>(n2, n2, std::complex(0.), diag, T, n2); -#ifdef DUMP_MATRICES - DUMP_STREAM << "T:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,N,n2, DUMP_STREAM) << std::endl << std::endl; -#endif - // Now subtract P from it. This loops through each row and treats each row - // as a vector, computing - // T[i,:] = -1*P[i,:] + T[i,:] - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, std::complex(-1.), &P[0+i*n2], 1, &T[0+i*n2], 1); - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "T:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,T,n2, DUMP_STREAM) << std::endl << std::endl; - /* DUMP_STREAM << "Epsilon_inv:" << std::endl; */ - /* RNP::IO::PrintMatrix(n,n, epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; */ -#endif - // Now we compute the matrix product of N and Epsilon2 and store it in - // the memory space of N - /* RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, std::complex(1.),N,n2,Epsilon2,n2,std::complex(0.),N,n2); */ - // Now etangential - RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,T,n2,&eh[4*n2],1, z_zero,&dn_and_et[n2],1); -#ifdef DUMP_MATRICES - DUMP_STREAM << "Etangential:" << std::endl; - RNP::IO::PrintVector(n2,&dn_and_et[n2],1,DUMP_STREAM) << std::endl << std::endl; -#endif - if(NULL != T){ - rcwa_free(T); - } - } - if(NULL != efield && NULL != epsilon_inv){ - // This is setting up the left side of equation 11 of the paper and - // storing in the first N elements of the eh array. The goal is to - // compute the ez components - for(size_t i = 0; i < n; ++i){ - eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); - } - // This multiplies epsilon_inv by eh[0:N] and copies it into the - // memory space of eh[N:2N]. Looking at equation 11 of the paper, you - // can see after setting up the left size we still need to multiply it - // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some - // reason that is done in the next loop. I think we could do it by - // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega - // and z_one by 1/omega in the MultMV call. Perhaps there is a reason - // the author didn't do it this way - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); - RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); - }else{ - // eh[n] = (1+0i) * epsilon_inv*eh[0:N] + 0 * eh[N] - // The 0 * eh[n] part seems silly but it allows us to copy the - // result of the first term into eh[n] - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); - } - } - - // This loop below is basically equation 5 from the paper!! - for(size_t i = 0; i < n; ++i){ - // This is (\vec{k} + \vec{G}) \cdot \vec{x} at each G vector - const double theta = (kx[i]*r[0] + ky[i]*r[1]); - // This is using Euler's rule to compute e^{i (\vec{k} + \vec{G}) \cdot \vec{x}) - const std::complex phase(cos(theta),sin(theta)); - // And this is just where each individual term is added to the total - fH[0] += hx[i]*phase; - fH[1] += hy[i]*phase; - if(NULL != P){ - fE[0] += dnx[i]*(phase/epsilon) + etx[i]*phase; - fE[1] -= ndny[i]*(phase/epsilon) + nety[i]*phase; - } else { - fE[0] += ex[i]*phase; - fE[1] -= ney[i]*phase; - } - // These is eqn 14 from the paper. z component of h is expressed in - // terms of transverse components of e - fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega); - // This is the final bit of equation 11 from the paper. We need to - // divide by omega to get the fourier coefficients of e_z, then - // multiply by phase and sum to get real space resconstruction - fE[2] += eh[n+i] * (phase / omega); - } - - if(NULL != efield && NULL != epsilon_inv){ - efield[0] = fE[0]; - efield[1] = fE[1]; - efield[2] = fE[2]; - } - if(NULL != hfield){ - hfield[0] = fH[0]; - hfield[1] = fH[1]; - hfield[2] = fH[2]; - } - - if(NULL == work){ - rcwa_free(eh); - } - if(NULL != dn_and_et){ - rcwa_free(dn_and_et); - } -} - -void GetFieldOnGridImproved( - size_t n, // glist.n - int *G, - const double *kx, const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL || kp == NULL - const std::complex *P, // Projection operator onto tangential. 2nx2n matrix, size 4n^2 - const std::complex *W, // Weismann operator. 2nx2n matrix, size 4n^2 - const std::complex *epsilon, // The scalar, real space value of epsilon - int epstype, - const std::complex *ab, // length 4*glist.n - const size_t nxy[2], // number of points per lattice direction - std::complex *efield, - std::complex *hfield -){ - const std::complex z_zero(0.); - const std::complex z_one(1.); - const size_t n2 = 2*n; - // Total # of real space sampling points - const size_t N = nxy[0]*nxy[1]; - // Center of grid in real space - const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) }; - int inxy[2] = { (int)nxy[0], (int)nxy[1] }; - int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] }; - - std::complex *eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - - GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); - const std::complex *hx = &eh[3*n2+0]; - const std::complex *hy = &eh[3*n2+n]; - const std::complex *ney = &eh[4*n2+0]; - const std::complex *ex = &eh[4*n2+n]; - - // This is allocating memory for 6 Fourier transforms of size N (number of - // real space grid points requested). This makes sense because there are 3 - // E field components to compute, and 3 H field components to compute (6 - // overall) - // The two arrays below are arrays of pointers. This means each element in - // from/to is a pointer to an entirely new memory space that is allocated - // by the fft_alloc_complex function - std::complex *from[8]; - std::complex *to[8]; - fft_plan plan[8]; - for(unsigned i = 0; i < 8; ++i){ - from[i] = fft_alloc_complex(N); - to[i] = fft_alloc_complex(N); - memset(from[i], 0, sizeof(std::complex) * N); - // This is just a function that allocates a struct that "plans" or - // configures the Fourier transform. Mainly astruct containing the - // pointers to memory space containing real space data (from) and - // pointer to the memory space for the results (to) - plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1); - } - - // This is setting up the left side of equation 11 of the paper and - // storing in the first N elements of the eh array - for(size_t i = 0; i < n; ++i){ - eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); - } - // This multiplies epsilon_inv by eh[0:N] and copies it into the - // memory space of eh[N:2N]. Looking at equation 11 of the paper, you - // can see after setting up the left size we still need to multiply it - // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some - // reason that is done in the next loop. I think we could do it by - // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega - // and z_one by 1/omega in the MultMV call. Perhaps there is a reason - // the author didn't do it this way - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); - RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); - }else{ - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); - } - - // So the equations look like this: - // [-d_{n,y} d_{n,x}] = W [-ey ex] - // \hat{T} = I - P - // [-e_{t,y} e_{t,x}] = T [-ey, ex] - // Remeber P is NULL for unpatterned layers, and we don't need to use this - // procedure for unpatterned layers anyway! - std::complex *dn_and_et = (std::complex*)rcwa_malloc(sizeof(std::complex) * 2*n2); - // These are just some convenience pointers to the various sections of - // the dn and et array - const std::complex *ndny = &dn_and_et[0]; - const std::complex *dnx = &dn_and_et[n]; - const std::complex *nety = &dn_and_et[n2]; - const std::complex *etx = &dn_and_et[n2+n]; - if(NULL != W){ - // We now have all we need to get the Fourier coefficients of the - // normal component of d and the tangential components of e. I - // allocated a new array for the dnormal components and etangential - // components. dnormal will be in the first 2N elements, etangential in - // the last 2N elements - // First dnormal - // This is my approach - /* RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,N,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); */ - // This is Weismann's approach verbatim - RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,W,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); -#ifdef DUMP_MATRICES - DUMP_STREAM << "Dnormal:" << std::endl; - RNP::IO::PrintVector(n2,dn_and_et,1,DUMP_STREAM) << std::endl << std::endl; -#endif - // Then lets construct \hat{T}. This is the operator that projects onto the - // direction tangential to a material interface, and is just I - P - std::complex *T; - // std::complex diag = std::complex(1.0*n); - std::complex diag = std::complex(1.0); - T = (std::complex*)rcwa_malloc(sizeof(std::complex) * n2*n2); - // Set T to the identity first - RNP::TBLAS::SetMatrix<'A'>(n2, n2, std::complex(0.), diag, T, n2); -#ifdef DUMP_MATRICES - DUMP_STREAM << "T:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,N,n2, DUMP_STREAM) << std::endl << std::endl; -#endif - // Now subtract P from it. This loops through each row and treats each row - // as a vector, computing - // T[i,:] = -1*P[i,:] + T[i,:] - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, std::complex(-1.), &P[0+i*n2], 1, &T[0+i*n2], 1); - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "T:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,T,n2, DUMP_STREAM) << std::endl << std::endl; - /* DUMP_STREAM << "Epsilon_inv:" << std::endl; */ - /* RNP::IO::PrintMatrix(n,n, epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; */ -#endif - // Now etangential - RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,T,n2,&eh[4*n2],1, z_zero,&dn_and_et[n2],1); -#ifdef DUMP_MATRICES - DUMP_STREAM << "Etangential:" << std::endl; - RNP::IO::PrintVector(n2,&dn_and_et[n2],1,DUMP_STREAM) << std::endl << std::endl; -#endif - if(NULL != T){ - rcwa_free(T); - } - } - - // Here is where we populate the matrices of fourier coefficients that we - // are then going to DFT - for(size_t i = 0; i < n; ++i){ - // G just contains indices for the two components of the G vectors. A - // pair of these indices defines a point in k-space, and the indices in - // this pair are stored adjacent to one another in the G array. So - // these are basically just (m, n). The indices run from -Nmax/2 to - // Nmax/2 where Nmax is the radius in k-space you get after circular - // truncation. - const int iu = G[2*i+0]; - const int iv = G[2*i+1]; - // This if block has to do with the shifting of the origin. Is it at - // the corner or the center of the unit cell? - if( - (nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) && - (nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1]) - ){ - - const int ii = (iu >= 0 ? iu : iu + nxy[0]); - const int jj = (iv >= 0 ? iv : iv + nxy[1]); - // I think I need to jump in here, and just populate from[3] with - // the fourier coefficients of Dnormal,x and Etangential,x and - // from[4] with the y components - // Dnormal_{x,y} inside from[3],from[4] and Etangential_{x,y} - // inside from[6],from[7] - from[0][ii+jj*nxy[0]] = hx[i]; - from[1][ii+jj*nxy[0]] = hy[i]; - from[2][ii+jj*nxy[0]] = (kx[i] * ney[i] + ky[i] * ex[i]) / omega; - from[3][ii+jj*nxy[0]] = dnx[i]; - from[4][ii+jj*nxy[0]] = -ndny[i]; - from[5][ii+jj*nxy[0]] = eh[n+i] / omega; - from[6][ii+jj*nxy[0]] = etx[i]; - from[7][ii+jj*nxy[0]] = -nety[i]; - } - } - - for(unsigned i = 0; i < 8; ++i){ - fft_plan_exec(plan[i]); - } - - for(size_t j = 0; j < nxy[1]; ++j){ - for(size_t i = 0; i < nxy[0]; ++i){ - // This is the real space reconstruction - hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]]; - hfield[3*(i+j*nxy[0])+1] = to[1][i+j*nxy[0]]; - hfield[3*(i+j*nxy[0])+2] = to[2][i+j*nxy[0]]; - // x component - efield[3*(i+j*nxy[0])+0] = to[3][i+j*nxy[0]]/epsilon[i+j*nxy[0]] + to[6][i+j*nxy[0]]; - // y component - efield[3*(i+j*nxy[0])+1] = to[4][i+j*nxy[0]]/epsilon[i+j*nxy[0]] + to[7][i+j*nxy[0]];; - // z component - efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]]; - } - } - - for(unsigned i = 0; i < 8; ++i){ - fft_plan_destroy(plan[i]); - fft_free(to[i]); - fft_free(from[i]); - } - rcwa_free(eh); -} - -void GetFieldOnGrid( - size_t n, // glist.n - int *G, - const double *kx, const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL || kp == NULL - int epstype, - const std::complex *ab, // length 4*glist.n - const size_t nxy[2], // number of points per lattice direction - std::complex *efield, - std::complex *hfield -){ - const std::complex z_zero(0.); - const std::complex z_one(1.); - const size_t n2 = 2*n; - // Total # of real space sampling points - const size_t N = nxy[0]*nxy[1]; - // Center of grid - const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) }; - int inxy[2] = { (int)nxy[0], (int)nxy[1] }; - int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] }; - - std::complex *eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - - GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); - const std::complex *hx = &eh[3*n2+0]; - const std::complex *hy = &eh[3*n2+n]; - const std::complex *ney = &eh[4*n2+0]; - const std::complex *ex = &eh[4*n2+n]; - - std::complex *from[6]; - std::complex *to[6]; - fft_plan plan[6]; - for(unsigned i = 0; i < 6; ++i){ - from[i] = fft_alloc_complex(N); - to[i] = fft_alloc_complex(N); - memset(from[i], 0, sizeof(std::complex) * N); - plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1); - } - - for(size_t i = 0; i < n; ++i){ - eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); - } - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); - RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); - }else{ - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); - } - for(size_t i = 0; i < n; ++i){ - const int iu = G[2*i+0]; - const int iv = G[2*i+1]; - if( - (nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) && - (nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1]) - ){ - const int ii = (iu >= 0 ? iu : iu + nxy[0]); - const int jj = (iv >= 0 ? iv : iv + nxy[1]); - from[0][ii+jj*nxy[0]] = hx[i]; - from[1][ii+jj*nxy[0]] = hy[i]; - from[2][ii+jj*nxy[0]] = (kx[i] * ney[i] + ky[i] * ex[i]) / omega; - from[3][ii+jj*nxy[0]] = ex[i]; - from[4][ii+jj*nxy[0]] = -ney[i]; - from[5][ii+jj*nxy[0]] = eh[n+i] / omega; - } - } - - for(unsigned i = 0; i < 6; ++i){ - fft_plan_exec(plan[i]); - } - - for(size_t j = 0; j < nxy[1]; ++j){ - for(size_t i = 0; i < nxy[0]; ++i){ - hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]]; - hfield[3*(i+j*nxy[0])+1] = to[1][i+j*nxy[0]]; - hfield[3*(i+j*nxy[0])+2] = to[2][i+j*nxy[0]]; - efield[3*(i+j*nxy[0])+0] = to[3][i+j*nxy[0]]; - efield[3*(i+j*nxy[0])+1] = to[4][i+j*nxy[0]]; - efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]]; - } - } - - for(unsigned i = 0; i < 6; ++i){ - fft_plan_destroy(plan[i]); - fft_free(to[i]); - fft_free(from[i]); - } - rcwa_free(eh); -} - -void GetZStressTensorIntegral( - size_t n, // glist.n - const double *kx, - const double *ky, - std::complex omega, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2 - const std::complex *epsilon2, // size (2*glist.n)^2 - int epstype, - const std::complex *ab, // length 4*glist.n - std::complex integral[3], - std::complex *work -){ - const size_t n2 = 2*n; - const std::complex z_zero(0.); - const std::complex z_one(1.); - - std::complex *eh = work; - if(NULL == work){ - eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - } - - GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); - std::complex *ez = &eh[1*n2+n]; - std::complex *dz = &eh[2*n2+0]; - std::complex *hz = &eh[2*n2+n]; - const std::complex *hx = &eh[3*n2+0]; - const std::complex *hy = &eh[3*n2+n]; - const std::complex *ney = &eh[4*n2+0]; - const std::complex *ex = &eh[4*n2+n]; - std::complex *ndy = &eh[5*n2+0]; - std::complex *dx = &eh[5*n2+n]; - - integral[0] = 0; - integral[1] = 0; - integral[2] = 0; - - for(size_t i = 0; i < n; ++i){ - hz[i] = (kx[i] * -ney[i] - ky[i] * ex[i]) / omega; - dz[i] = (ky[i]*hx[i] - kx[i]*hy[i]) / omega; - } - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Copy(n, dz,1, ez, 1); - RNP::TBLAS::Scale(n, epsilon_inv[0], ez,1); - }else{ - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, dz,1, z_zero,ez,1); - } - RNP::TBLAS::MultMV<'N'>(n2,n2, z_one,epsilon2,n2, ney,1, z_zero,ndy,1); - - for(size_t i = 0; i < n; ++i){ - std::complex cdz = std::conj(dz[i]); - std::complex cbz = std::conj(hz[i]); - integral[0] += ex[i]*cdz; - integral[1] +=-ney[i]*cdz; - integral[2] += 0.5*ez[i]*cdz; - integral[2] -= 0.5*ney[i]*std::conj(ndy[i]); - integral[2] -= 0.5*ex[i]*std::conj(dx[i]); - - integral[0] += hx[i]*cbz; - integral[1] += hy[i]*cbz; - integral[2] += 0.5*hz[i]*std::conj(hz[i]); - integral[2] -= 0.5*hy[i]*std::conj(hy[i]); - integral[2] -= 0.5*hx[i]*std::conj(hx[i]); - } - - if(NULL == work){ - rcwa_free(eh); - } -} - -std::complex zsinc(const std::complex &z){ - if(std::abs(z) < DBL_EPSILON){ - return 1.0; - }else{ - return std::sin(z)/z; - } -} - -void GetLayerVolumeIntegral( - char which, - size_t n, // glist.n - const double *kx, - const double *ky, - std::complex omega, - double thickness, - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2 - const std::complex *epsilon2, // size (2*glist.n)^2 - int epstype, - const std::complex *ab, // length 4*glist.n - std::complex *integral, - std::complex *work -){ - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - const std::complex omega2 = omega*omega; - const double aomega2(std::norm(omega)); - const double omega_2(1./aomega2); - std::complex omega2_2(1.); - if(0 != omega.imag()){ - omega2_2 = omega_2*omega2; - } - const std::complex z_zero(0.); - const std::complex z_one(1.); - - std::complex *Q = work; - if(NULL == work){ - Q = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - } - RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,0., Q,n4); - - // Form CsC (in Q) - if('E' == which || 'U' == which){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, epsilon2,n2, &Q[0+0*n4],n4); - }else if('e' == which){ - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., &Q[0+0*n4],n4); - } - if('H' == which || 'U' == which){ - for(size_t i = 0; i < n; ++i){ - Q[i+i*n4] += omega_2*kx[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - Q[i+n+i*n4] += omega_2*ky[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - Q[i+(i+n)*n4] += omega_2*kx[i]*ky[i]; - } - for(size_t i = 0; i < n; ++i){ - Q[i+n+(i+n)*n4] += omega_2*ky[i]*ky[i]; - } - } - // At this point, top block of CsC is done - if('E' == which || 'U' == which){ - MakeKPMatrix(omega, n, kx, ky, epsilon_inv, epstype, kp, &Q[n2+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, -omega_2, &Q[n2+(i+n2)*n4],1); - Q[i+n2+(i+n2)*n4] += z_one; - Q[i+n2+(i+n2)*n4] *= omega2_2; - } - }else if('e' == which){ - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::SetMatrix<'A'>(n,n, 0.,std::norm(epsilon_inv[0]), &Q[n2+n2*n4],n4); - }else{ - RNP::TBLAS::MultMM<'C','N'>(n,n,n, z_one,epsilon_inv,n, epsilon_inv,n, z_zero,&Q[n2+n2*n4],n4); - } - RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+n+n2*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+(n2+n)*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+n+(n2+n)*n4],n4); - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, ky[i], &Q[n2+0+(n2+i)*n4],1); - RNP::TBLAS::Scale(n2, -kx[i], &Q[n2+0+(n2+n+i)*n4],1); - RNP::TBLAS::Scale(n2, ky[i], &Q[n2+i+n2*n4],n4); - RNP::TBLAS::Scale(n2, -kx[i], &Q[n2+n+i+n2*n4],n4); - } - } - if('H' == which || 'U' == which){ - for(size_t i = 0; i < n2; ++i){ - Q[i+n2+(i+n2)*n4] += z_one; - } - } - - // Apply the amplitude-to-field matrix M to either side - // CsC = [ A ] M = [ kp.phi.inv(q) -kp.phi.inv(q) ] = [ fie -fie ] - // [ B ] [ phi phi ] [ phi phi ] - // Q = M^H * CsC * M = [ S+T -S+T ], S = fie^H * A * fie - // [ -S+T S+T ] T = phi^H * B * phi - // integral = int_0^thickness [x(z)^H * Q * x(z)] dz, x(z) = [a*exp(iqz);b*exp(iq(d-z))] - - // Make T - if(NULL != phi){ - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[n2+n2*n4],n4, phi,n2, z_zero,&Q[0+n2*n4],n4); - RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,phi,n2, &Q[0+n2*n4],n4, z_zero, &Q[n2+n2*n4],n4); - } - // At this point: Q = [ A ] - // [ T ] - if(NULL == kp){ - MakeKPMatrix(omega, n, kx, ky, epsilon_inv, epstype, kp, &Q[0+n2*n4],n4); - } - const std::complex *kp_use = (kp != NULL ? kp : &Q[0+n2*n4]); - const size_t ldkp = (kp != NULL ? n2 : n4); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[0+0*n4],n4, kp_use,ldkp, z_zero,&Q[n2+0*n4],n4); - RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,kp_use,ldkp, &Q[n2+0*n4],n4, z_zero, &Q[0+0*n4],n4); - if(NULL != phi){ - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[0+0*n4],n4, phi,n2, z_zero,&Q[n2+0*n4],n4); - RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,phi,n2, &Q[n2+0*n4],n4, z_zero, &Q[0+0*n4],n4); - } - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, 1./(omega*q[i]), &Q[0+i*n4],1); - RNP::TBLAS::Scale(n2, 1./(omega*q[i]), &Q[i+0*n4],n4); - } - // At this point: Q = [ S ] - // [ T ] - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[n2+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, -1., &Q[0+i*n4],1, &Q[n2+i*n4],1); - RNP::TBLAS::Axpy(n2, -1., &Q[0+i*n4],1, &Q[0+(i+n2)*n4],1); - } - // At this point: Q = [ S T-S ] - // [ T-S T ] - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, 1., &Q[0+i*n4],1, &Q[n2+(i+n2)*n4],1); - } - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[0+0*n4],n4); - // Q done - - // The integral we need to construct is - // \int_0^d x(z)^H * Q * x(z) dz, where x(z) = [ a*e^{iqz}; b*e^{iq(d-z)} ] - // We are summing terms like - // \int_0^d x_i(z)^* x_j(z) Q_{ij} dz - // The generic form is - // Q_{ij} ab[i]^* ab[j] \int_0^d e^{-iq^* L_i(z)} e^{iq L_j(z)}, - // where L_i(z) is either z or d-z, and s_i is either 1 or -1. - // Note that L_i(d) = d or 0, so L_i(d) = d*(1+s_i)/2, and L_i(0) = d*(1-s_i)/2 - // The integral simplifies to - // \frac{1}{i(s_j q - s_i q^*)} [ e^{iq L_j(d)-iq^* L_i(d)} - e^{iq L_j(0)-iq^* L_i(0)} ] - // \frac{1}{i(s_j q - s_i q^*)} [ e^{id/2 (q (s_j+1)-q^* (s_i+1))} - e^{id/2 (q (1-s_j)-q^* (1-s_i))} ] - // \frac{1}{i(s_j q - s_i q^*)} [ e^{id/2 (q-q^*)} e^{id/2 (s_j q - s_i q^*)} - e^{id/2 (q-q^*)} e^{-id/2 (s_j q - s_i q^*)} ] - // \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} [ e^{id/2 (s_j q - s_i q^*)} - e^{-id/2 (s_j q - s_i q^*)} ] - // \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} 2 i sin[ d/2 (s_j q - s_i q^*) ] - // \frac{d e^{id/2 (q-q^*)} }{d/2 (s_j q - s_i q^*)} sin[ d/2 (s_j q - s_i q^*) ] - // d e^{id/2 (q-q^*)} sinc[ d/2 (s_j q - s_i q^*) ], where sinc(x) = sin(x)/x - - *integral = 0; - const double d_2 = 0.5*thickness; - for(size_t j = 0; j < n4; ++j){ - const double sj = (j < n2) ? 1 : -1; - const size_t qj = (j < n2) ? j : j-n2; - for(size_t i = 0; i < n4; ++i){ - const double si = (i < n2) ? 1 : -1; - const size_t qi = (i < n2) ? i : i-n2; - std::complex term = Q[i+j*n4] * std::conj(ab[i]) * ab[j]; - term *= thickness*std::exp(std::complex(0,d_2) * (q[qj] - std::conj(q[qi]))); - term *= zsinc(d_2 * (sj*q[qj] - si*std::conj(q[qi]))); - *integral += term; - } - } - if(NULL == work){ - rcwa_free(Q); - } -} - - -// returns Integral[ abs(XX)^2, z] for XX = ex, ey, ez, hx, hy, hz -void GetLayerZIntegral( - size_t n, // glist.n - const double *kx, - const double *ky, - std::complex omega, - const double &thickness, const double r[2], - const std::complex *q, // length 2*glist.n - const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - const std::complex *phi, // size (2*glist.n)^2 - const std::complex *epsilon_inv, // size (glist.n)^2 - const std::complex *epsilon2, // size (2*glist.n)^2 - int epstype, - const std::complex *ab, // length 4*glist.n - double integral[6], - std::complex *work -){ - static const std::complex zero(0.); - static const std::complex one(1.); - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - const std::complex iomega(1./omega); - - std::complex *f = work; - if(NULL == work){ - f = (std::complex*)rcwa_malloc(sizeof(std::complex)*12*n4); - } - std::complex *g = f + 6*n4; - RNP::TBLAS::Fill(6*n4, 0., f, 1); - - // We will simultaneously generate f for all 6 components - - // Generate the Fourier phase factors - for(size_t i = 0; i < n; ++i){ // this is for ex - double theta = (kx[i]*r[0] + ky[i]*r[1]); - f[0*n4+1*n+i] = std::complex(cos(theta),-sin(theta)); // note the minus - } - RNP::TBLAS::Copy(n, &f[n],1, &f[1*n4+0*n],1); // -ey - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Copy(n, &f[n4],1, &f[2*n4+n2],1); // ez - RNP::TBLAS::Scale(n, iomega*epsilon_inv[0], &f[2*n4+n2],1); - }else{ - RNP::TBLAS::MultMV<'C'>(n,n, iomega,epsilon_inv,n, &f[n4],1, zero,&f[2*n4+n2],1); // ez - } - RNP::TBLAS::Copy(n, &f[2*n4+n2],1, &f[2*n4+n2+n],1); // ez - for(size_t i = 0; i < n; ++i){ // ez - f[2*n4+n2+i] *= ky[i]; - } - for(size_t i = 0; i < n; ++i){ // ez - f[2*n4+n2+n+i] *= -kx[i]; - } - RNP::TBLAS::Copy(n, &f[n4],1, &f[3*n4+n2],1); // hx - RNP::TBLAS::Copy(n, &f[n4],1, &f[4*n4+n2+n],1); // hy - for(size_t i = 0; i < n; ++i){ // -hz - f[5*n4+0*n+i] = f[n4+i] * iomega * kx[i]; - } - for(size_t i = 0; i < n; ++i){ // -hz - f[5*n4+1*n+i] = f[n4+i] * iomega * ky[i]; - } - - // Form M^H C^H * f - // M = [ kp.phi.inv(q) -kp.phi.inv(q) ] - // [ phi phi ] - if(NULL != phi){ - MultKPMatrix("C", omega, n, kx, ky, epsilon_inv, epstype, kp, 6, f,n4, &g[n2],n4); - RNP::TBLAS::MultMM<'C','N'>(n2,6,n2, one,phi,n2, &g[n2],n4, zero,&g[0 ],n4); - RNP::TBLAS::MultMM<'C','N'>(n2,6,n2, one,phi,n2, &f[n2],n4, zero,&g[n2],n4); - }else{ - MultKPMatrix("C", omega, n, kx, ky, epsilon_inv, epstype, kp, 6, f,n4, &g[0],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,6, &f[n2],n4, &g[n2],n4); - } // top half of g is phi^H kp^H ftop, bottom half is phi^H fbot - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(6, 1./std::conj(omega*q[i]), &g[i],n4); - } - for(size_t j = 0; j < 6; ++j){ - for(size_t i = 0; i < n2; ++i){ - std::complex t(g[i+j*n4]); - g[i+j*n4] += g[i+n2+j*n4]; - g[i+n2+j*n4] -= t; - } - } - - for(size_t i = 0; i < 6; ++i){ - integral[i] = 0; - } - - // At this point, each column of g contains a vector v such that - // v v^H is a rank-1 matrix that must be integrated over. - const double d_2 = 0.5*thickness; - for(size_t j = 0; j < n4; ++j){ - const double sj = (j < n2) ? 1 : -1; - const size_t qj = (j < n2) ? j : j-n2; - for(size_t i = 0; i < n4; ++i){ - const double si = (i < n2) ? 1 : -1; - const size_t qi = (i < n2) ? i : i-n2; - std::complex term = std::conj(ab[i]) * ab[j]; - term *= thickness*std::exp(std::complex(0,d_2) * (q[qj] - std::conj(q[qi]))); - term *= zsinc(d_2 * (sj*q[qj] - si*std::conj(q[qi]))); - for(size_t ii = 0; ii < 6; ++ii){ - integral[ii] += std::real(std::conj(g[j+ii*n4])*g[i+ii*n4] * term); - } - } - } - - if(NULL == work){ - rcwa_free(f); - } -} +/* Copyright (C) 2009-2011, Stanford University + * This file is part of S4 + * Written by Victor Liu (vkl@stanford.edu) + * + * S4 is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * S4 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "config.h" + +#define _USE_MATH_DEFINES +#include +#include +#include +#include +#include "rcwa.h" +#include "fmm/fft_iface.h" +#include +#ifdef HAVE_BLAS +# include +#endif +#include +#ifdef HAVE_LAPACK +# include +# include +#else +# include +#endif + +#ifdef DUMP_MATRICES +# define RNP_OUTPUT_MATHEMATICA +# define DUMP_STREAM std::cerr +//# define DUMP_STREAM (omega.real() > 1.91637 ? std::cerr : std::cout) +# include +#endif +# include +# include + +#include + +static inline void* rcwa_malloc(size_t size){ + void *ret = malloc_aligned(size, 16); + //memset(ret, 0x0, size); + return ret; +} +static inline void rcwa_free(void *ptr){ + free_aligned(ptr); +} + +#ifdef HAVE_LAPACK + extern "C" void zgelss_( + const integer &m, const integer &n, const integer &nRHS, + std::complex *a, const integer &lda, + std::complex *b, const integer &ldb, + double *s, const double &rcond, integer *rank, + std::complex *work, const integer &lwork, + double *rwork, integer *info + ); +#endif + +static void SingularLinearSolve( + size_t m, size_t n, size_t nRHS, std::complex *a, size_t lda, + std::complex *b, size_t ldb, const double &rcond +){ +#ifdef HAVE_LAPACK + integer info, rank; + std::complex dummy; + zgelss_(m, n, nRHS, a, lda, b, ldb, NULL, rcond, &rank, &dummy, -1, NULL, &info); + integer lwork = (integer)dummy.real(); + std::complex *work = (std::complex*)rcwa_malloc(sizeof(std::complex) * lwork); + double *rwork = (double*)rcwa_malloc(sizeof(double) * 6*m); + + zgelss_(m, n, nRHS, a, lda, b, ldb, rwork, rcond, &rank, work, lwork, rwork+m, &info); + + rcwa_free(rwork); + rcwa_free(work); +#else + RNP::LinearSolve<'N'>(n, nRHS, a, lda, b, ldb); +#endif +} + + +// kp = k-parallel matrix +// kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] +// [ -kx*epsinv*ky kx*epsinv*kx ] +// = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] +// + +static void MakeKPMatrix( + std::complex omega, + size_t n, + const double *kx, + const double *ky, + const std::complex *Epsilon_inv, + int epstype, + const std::complex *kp_existing, + std::complex *kp, + const size_t ldkp // leading dimension of kp, >= 2*n +){ + const size_t n2 = 2*n; + if(NULL != kp_existing){ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp); + return; + } + + const std::complex omega2 = omega*omega; + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., kp,ldkp); + + for(size_t i = 0; i < n; ++i){ + kp[(0+i)+(0+i)*ldkp] = omega2 - ky[i]*Epsilon_inv[0]*ky[i]; + kp[(n+i)+(0+i)*ldkp] = kx[i]*Epsilon_inv[0]*ky[i]; + } + for(size_t i = 0; i < n; ++i){ + kp[(0+i)+(n+i)*ldkp] = ky[i]*Epsilon_inv[0]*kx[i]; + kp[(n+i)+(n+i)*ldkp] = omega2 - kx[i]*Epsilon_inv[0]*kx[i]; + } + }else{ + RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+0*ldkp], ldkp); + RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+0*ldkp], ldkp); + RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+n*ldkp], ldkp); + RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+n*ldkp], ldkp); + + for(size_t i = 0; i < n; ++i){ + RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1); + } + for(size_t i = 0; i < n; ++i){ + RNP::TBLAS::Scale(n2, kx[i], &kp[0+(i+n)*n2], 1); + } + for(size_t i = 0; i < n; ++i){ + RNP::TBLAS::Scale(n2, ky[i], &kp[i+0*n2], ldkp); + } + for(size_t i = 0; i < n; ++i){ + RNP::TBLAS::Scale(n2, -kx[i], &kp[i+n+0*n2], ldkp); + } + const std::complex omega2 = omega*omega; + for(size_t i = 0; i < n2; ++i){ + kp[i+i*ldkp] += omega2; + } + } +} + +// kp = k-parallel matrix +// kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] +// [ -kx*epsinv*ky kx*epsinv*kx ] +// = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] +// [ kx ] + +void MultKPMatrix( + const char *trans, + const std::complex omega, + const size_t n, + const double *kx, + const double *ky, + const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + const int Epsilon2_type, + const std::complex *kp, // kp if exists. Can be NULL + const size_t ncols, + const std::complex *a, + const size_t lda, + std::complex *y, // same size as a, y <- kp*a + const size_t ldy +){ + const size_t n2 = 2*n; + + if(NULL != kp){ + if('N' == trans[0]){ + if(ncols > 1){ + RNP::TBLAS::MultMM<'N','N'>(n2,ncols,n2, std::complex(1.),kp,n2, a,lda, std::complex(0.),y,ldy); + }else{ + RNP::TBLAS::MultMV<'N'>(n2,n2, std::complex(1.),kp,n2, a,1, std::complex(0.),y,1); + } + }else{ + if(ncols > 1){ + RNP::TBLAS::MultMM<'C','N'>(n2,ncols,n2, std::complex(1.),kp,n2, a,lda, std::complex(0.),y,ldy); + }else{ + RNP::TBLAS::MultMV<'C'>(n2,n2, std::complex(1.),kp,n2, a,1, std::complex(0.),y,1); + } + } + return; + } + + const std::complex omega2 = omega*omega; + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == Epsilon2_type || EPSILON2_TYPE_BLKDIAG2_SCALAR == Epsilon2_type){ + const std::complex epsinv( + 'N' == trans[0] ? Epsilon_inv[0] : std::conj(Epsilon_inv[0]) + ); + for(size_t j = 0; j < ncols; ++j){ + for(size_t i = 0; i < n; ++i){ + std::complex c = a[(0+i)+j*lda]; + std::complex d = a[(n+i)+j*lda]; + y[(0+i)+j*ldy] = (omega2 - ky[i]*epsinv*ky[i]) * c + ( ky[i]*epsinv*kx[i]) * d; + y[(n+i)+j*ldy] = ( kx[i]*epsinv*ky[i]) * c + (omega2 - kx[i]*epsinv*kx[i]) * d; + } + } + }else{ + for(size_t j = 0; j < ncols; ++j){ + for(size_t i = 0; i < n; ++i){ + y[i+j*ldy] = ky[i]*a[(0+i)+j*lda] - kx[i]*a[(n+i)+j*lda]; + } + } + if('N' == trans[0]){ + if(ncols > 1){ + RNP::TBLAS::MultMM<'N','N'>(n,ncols,n, 1.,Epsilon_inv,n, &y[0+0*ldy],ldy, 0.,&y[n+0*ldy],ldy); + }else{ + RNP::TBLAS::MultMV<'N'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1); + } + }else{ + if(ncols > 1){ + RNP::TBLAS::MultMM<'C','N'>(n,ncols,n, 1.,Epsilon_inv,n, &y[0+0*ldy],ldy, 0.,&y[n+0*ldy],ldy); + }else{ + RNP::TBLAS::MultMV<'C'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1); + } + } + + for(size_t j = 0; j < ncols; ++j){ + for(size_t i = 0; i < n; ++i){ + y[(0+i)+j*ldy] = omega2 * a[(0+i)+j*lda] - ky[i]*y[n+i+j*ldy]; + y[(n+i)+j*ldy] = omega2 * a[(n+i)+j*lda] + kx[i]*y[n+i+j*ldy]; + } + } + } +} + +void SolveLayerEigensystem_uniform( + std::complex omega, + size_t n, + const double *kx, + const double *ky, + std::complex eps, + std::complex *q, // length 2*glist.n + std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + std::complex *phi // size (2*glist.n)^2 +){ + const size_t n2 = 2*n; + + const std::complex eta = 1./ eps; + const std::complex omega2 = omega*omega; + + // Fill kp + if(NULL != kp){ + MakeKPMatrix(omega, n, kx, ky, &eta, EPSILON2_TYPE_BLKDIAG1_SCALAR, NULL, kp, n2); + +#ifdef DUMP_MATRICES + DUMP_STREAM << "kp:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,kp,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,kp,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + } + + for(size_t i = 0; i < n; ++i){ + q[i] = eps*omega2 - kx[i]*kx[i] - ky[i]*ky[i]; + + // Set the \hat{q} vector (diagonal matrix) while we're at it + if(0 == omega.imag()){ // Not bandsolving + q[i] = std::sqrt(q[i]); + if(q[i].imag() < 0){ + q[i] = -q[i]; + } + }else{ // performing some kind of bandsolving, need to choose the appropriate branch + if(q[i].real() < 0){ + // branch cut should be just below positive real axis + q[i] = std::complex(0,1) * std::sqrt(-q[i]); + }else{ + // branch cut should be just below negative real axis + // This is the default behavior for sqrt(std::complex) + q[i] = std::sqrt(q[i]); + } + } + + /* + if(std::complex(0) == q[i]){ + q[i] = DBL_EPSILON*omega; + } + */ + + q[i+n] = q[i]; + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "q:" << std::endl; + RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl; +#endif + if(NULL != phi){ + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., phi,n2); +#ifdef DUMP_MATRICES + DUMP_STREAM << "phi:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + } +} + +void SolveLayerEigensystem( + std::complex omega, + size_t n, + const double *kx, + const double *ky, + const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + const std::complex *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix) + int epstype, + std::complex *q, // length 2*glist.n + std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + std::complex *phi, // size (2*glist.n)^2 + std::complex *work_, + double *rwork_, + size_t lwork +){ + const size_t n2 = 2*n; + + if((size_t)-1 == lwork){ + //M.P. (Michael Povolotskyi) Bellow I have to mimic the instructions from RNP::Eigensystem in file Eigensystems.cpp + // because MKL lapack cannot handle NULL pointer for the matrix + // RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, NULL, lwork); //M.P + work_[0] = double(2*n2); //M.P. + work_[0] += n2*n2; + + return; + }else if(0 == lwork){ + lwork = n2*n2+2*n2; + } + + std::complex *work = work_; + size_t eigenlwork; + if(NULL == work_ || lwork < n2*n2+2*n2){ + lwork = (size_t)-1; + //M.P. Same as above + // RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork); //M.P. + q[0] = (double)(2*n2); //M.P + eigenlwork = (size_t)q[0].real(); + work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(eigenlwork + n2*n2)); + }else{ + eigenlwork = lwork - n2*n2; + } + std::complex *op = work; + std::complex *eigenwork = op + n2*n2; + + double *rwork = rwork_; + if(NULL == rwork_){ + rwork = (double*)rcwa_malloc(sizeof(double)*2*n2); + } + +#ifdef DUMP_MATRICES + DUMP_STREAM << "kx:" << std::endl; + RNP::IO::PrintVector(n,kx,1, DUMP_STREAM) << std::endl; + DUMP_STREAM << "ky:" << std::endl; + RNP::IO::PrintVector(n,ky,1, DUMP_STREAM) << std::endl << std::endl; +#endif + +#ifdef DUMP_MATRICES + DUMP_STREAM << "Epsilon_inv:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n,n,Epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n,Epsilon_inv,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + // Fill kp + std::complex *kp_use = (NULL != kp ? kp : phi); + + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2); +#ifdef DUMP_MATRICES + DUMP_STREAM << "kp:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,kp_use,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + +#ifdef DUMP_MATRICES + DUMP_STREAM << "Epsilon2:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,Epsilon2,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + // Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky] + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2); + + for(size_t i = 0; i < n; ++i){ + op[i+i*n2] -= kx[i]*kx[i]; + } + for(size_t i = 0; i < n; ++i){ + op[i+n+i*n2] -= ky[i]*kx[i]; + } + for(size_t i = 0; i < n; ++i){ + op[i+(i+n)*n2] -= kx[i]*ky[i]; + } + for(size_t i = 0; i < n; ++i){ + op[i+n+(i+n)*n2] -= ky[i]*ky[i]; + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "op:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; + //RNP::IO::PrintVector(n2,&op[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + +#ifdef DUMP_MATRICES +# ifdef DUMP_MATRICES_LARGE + std::complex *op_save = (std::complex *)rcwa_malloc(sizeof(std::complex) * 2*n2*n2); + std::complex *op_temp = op_save + n2*n2; + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2); +# endif +#endif + int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork); + if(0 != info){ + std::cerr << "Layer eigensystem returned info = " << info << std::endl; + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "eigen info = " << info << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::TBLAS::Fill(n2*n2, 0., op_temp,1); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,op_save,n2, phi,n2, 0.,op_temp,n2); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Axpy(n2, -q[i], &phi[0+i*n2],1, &op_temp[0+i*n2],1); + } + DUMP_STREAM << "eigensystem residual:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl; + rcwa_free(op_save); +# endif +#endif + + for(size_t i = 0; i < n2; ++i){ + // Set the \hat{q} vector (diagonal matrix) while we're at it + if(0 == omega.imag()){ // Not bandsolving + q[i] = std::sqrt(q[i]); + if(q[i].imag() < 0){ + q[i] = -q[i]; + } + }else{ // performing some kind of bandsolving, need to choose the appropriate branch + if(q[i].real() < 0){ + // branch cut should be just below positive real axis + q[i] = std::complex(0,1) * std::sqrt(-q[i]); + }else{ + // branch cut should be just below negative real axis + // This is the default behavior for sqrt(std::complex) + q[i] = std::sqrt(q[i]); + } + } + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "q:" << std::endl; + RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl; + DUMP_STREAM << "phi:" << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + if(NULL == work_ || lwork < n2*n2+2*n2){ + rcwa_free(work); + } + if(NULL == rwork_){ + rcwa_free(rwork); + } +} + + +void InitSMatrix( + size_t n, + std::complex *S // size (4*n)^2 +){ + const size_t n4 = 4*n; + RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); +} +void GetSMatrix( + size_t nlayers, + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const double *thickness, // list of thicknesses + const std::complex **q, // list of q vectors + const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int *epstype, + const std::complex **kp, + const std::complex **phi, + std::complex *S, // size (4*n)^2 + std::complex *work_, + size_t *iwork, + size_t lwork +){ + if(0 == nlayers){ return; } + const size_t n2 = 2*n; + const size_t n4 = 2*n2; + + if((size_t)-1 == lwork){ + work_[0] = n4*(n4+1); + return; + } + std::complex *work = work_; + if(NULL == work_ || lwork < n4*(n4+1)){ + work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(n4*(n4+1))); + } + size_t *pivots = iwork; + if(NULL == iwork){ + pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4); + } + + RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); + + std::complex *t1 = work; + std::complex *t2 = t1 + n2*n2; + std::complex *in1 = t2 + n2*n2; + std::complex *in2 = in1 + n2*n2; + std::complex *d1 = in2 + n2*n2; + std::complex *d2 = d1 + n2; + + for(size_t l = 0; l < nlayers-1; ++l){ + size_t lp1 = l+1; + if(lp1 >= nlayers){ lp1 = l; } + + // Make the interface matrices + if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){ + // This is a trivial interface, set to identity + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2); + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2); + }else{ + // The interface matrix is the inverse of the mode-to-field matrix of layer l + // times the mode-to-field matrix of layer l+1 (lp1). + // The mode-to-field matrix is of the form + // [ B -B ] where A = phi + // [ A A ] where B = kp*phi*inv(diag(q)) = G*A/q + // So we want + // 0.5 * [ iBl iAl ] [ Blp1 -Blp1 ] + // [ -iBl iAl ] [ Alp1 Alp1 ] + // Multiplying out gives + // 0.5 * [ P+Q P-Q ] // where P = iAl*Alp1, and i in front means inverse + // [ P-Q P+Q ] // where Q = iBl*Blp1 + // Making P is easy, since A is a single matrix. + // Making Q is as follows: + // Q = iBl*Blp1 + // = ql*iAl*iGl * Gl*Alp1*iqlp1 + // We will only store I11 and I21 + /* + { + std::complex *Ml = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); + std::complex *Mlp1 = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); + + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[0+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, 1./q[l][i], &Ml[0+(i+n2)*n4], 1); + } + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[l],n2, &Ml[0+n2*n4],n4, 0.,&Ml[0+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Ml[0+0*n4],n4, &Ml[0+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1); + } + + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[0+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, 1./q[lp1][i], &Mlp1[0+(i+n2)*n4], 1); + } + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[lp1],n2, &Mlp1[0+n2*n4],n4, 0.,&Mlp1[0+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Mlp1[0+0*n4],n4, &Mlp1[0+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1); + } + + //RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots); +#ifdef DUMP_MATRICES + DUMP_STREAM << "M(" << l << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,Ml,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,Ml,1, DUMP_STREAM) << std::endl << std::endl; +# endif + DUMP_STREAM << "M(" << lp1 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,Mlp1,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + rcwa_free(Mlp1); + rcwa_free(Ml); + } + */ + // Make Bl in t1 + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2); + { + if(NULL == phi[l]){ + if(NULL == kp[l]){ + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2); + }else{ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2); + } + }else{ + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2); + } + //for(size_t i = 0; i < n2; ++i){ + // RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1); + //} + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "Bl(" << l << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,t1,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + // Make Blp1 in in1 + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2); + { + if(NULL == phi[lp1]){ + if(NULL == kp[lp1]){ + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2); + }else{ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2); + } + }else{ + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2); + } + //for(size_t i = 0; i < n2; ++i){ + // RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); + //} + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + int solve_info; + // Make Q in in1 + //RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots); + SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON); + // Now perform the diagonal scalings + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2); + } + { + double maxel = 0; + for(size_t i = 0; i < n2; ++i){ + double el = std::abs(q[lp1][i]); + if(el > maxel){ maxel = el; } + } + for(size_t i = 0; i < n2; ++i){ + double el = std::abs(q[lp1][i]); + if(el < DBL_EPSILON * maxel){ + RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1); + }else{ + RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); + } + } + } + + // Make P in in2 + if(NULL == phi[lp1]){ + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2); + }else{ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2); + } + if(NULL != phi[l]){ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2); + RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots); + } + + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q + RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q + RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q + RNP::TBLAS::Scale(n2*n2, 0.5, in1,1); + RNP::TBLAS::Scale(n2*n2, 0.5, in2,1); + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; +# endif + DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + for(size_t i = 0; i < n2; ++i){ + d1[i] = std::exp(q[l ][i] * std::complex(0,thickness[l ])); + d2[i] = std::exp(q[lp1][i] * std::complex(0,thickness[lp1])); + } + + // Make S11 + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21 + for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); + } + RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21) + + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2); + int solve_info; + RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1} + + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2); + for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); + } + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4); + // S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1} + + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22 + for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); + } + RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12 + for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1} + RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1); + } + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4); + // S12 done, and t2 can be reused + + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21 + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4); + // S21 done, need to keep t1 = S22 I21 + + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22 + for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1} + RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1); + } + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4); + +#ifdef DUMP_MATRICES + DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + } + if(NULL == work_ || lwork < n4*(n4+1)){ + rcwa_free(work); + } + if(NULL == iwork){ + rcwa_free(pivots); + } +} + +int SolveInterior( + size_t nlayers, + size_t which_layer, + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const double *thickness, // list of thicknesses + const std::complex **q, // list of q vectors + const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int *epstype, + const std::complex **kp, + const std::complex **phi, + std::complex *a0, // length 2*n + std::complex *bN, // length 2*n + std::complex *ab, // length 4*n + std::complex *work_, // length lwork + size_t *iwork, // length n2 + size_t lwork // set to -1 for query into work[0], at least 2*(4*n)^2 + 2*(2*n) + 4*n*(4*n+1) +){ + if(0 == nlayers){ return-1; } + if(which_layer >= nlayers){ return -2; } + + const size_t n2 = 2*n; + const size_t n4 = 2*n2; + const size_t lwork_GetSMatrix = 4*n*(4*n+1); + const size_t lwork_needed = 2*n4*n4 + 2*n2 + lwork_GetSMatrix; + + if((size_t)-1 == lwork){ + work_[0] = lwork_needed; + return 0; + } + std::complex *work = work_; + if(NULL == work_ || lwork < lwork_needed){ + work = (std::complex*)rcwa_malloc(sizeof(std::complex)*lwork_needed); + } + size_t *pivots = iwork; + if(NULL == iwork){ + pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n2); + } + + std::complex *S0l = work; + std::complex *SlN = S0l + n4*n4; + std::complex *S11a0 = SlN + n4*n4; + std::complex *S22bN = S11a0 + n2; + std::complex *work_GetSMatrix = S22bN + n2; + std::complex *al = ab; + std::complex *bl = al+n2; + std::complex *temp = S0l; + size_t ldtemp = n4; + + int info; + + GetSMatrix(which_layer+1, n, kx, ky, omega, + thickness, q, Epsilon_inv, epstype, kp, phi, + S0l, work_GetSMatrix, pivots, lwork_GetSMatrix); + GetSMatrix(nlayers-which_layer, n, kx, ky, omega, + thickness+which_layer, q+which_layer, Epsilon_inv+which_layer, epstype+which_layer, kp+which_layer, phi+which_layer, + SlN, work_GetSMatrix, pivots, lwork_GetSMatrix); + +#ifdef DUMP_MATRICES + DUMP_STREAM << "S0l(0," << which_layer << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,S0l,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,S0l,1, DUMP_STREAM) << std::endl << std::endl; +# endif + DUMP_STREAM << "SlN(" << which_layer << "," << nlayers-1 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,SlN,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,SlN,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif + + // both solutions only depend on the products S11(0,l)*a0 and S22(l,N)*bN + if(NULL != a0){ + RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &S0l[0+0*n4], n4, + a0, 1, + std::complex(0.0), S11a0, 1); + }else{ + RNP::TBLAS::Fill(n2, 0., S11a0, 1); + } + if(NULL != bN){ + RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &SlN[n2+n2*n4], n4, + bN, 1, + std::complex(0.0), S22bN, 1); + }else{ + RNP::TBLAS::Fill(n2, 0., S22bN, 1); + } + + // We overwrite the upper left submatrix S11(0,l) since it's not needed anymore + // temp is set to S0l for this reason. + + // Compute -S_12(0,l)S_21(l,N) + RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex(-1.0), &S0l[0+n2*n4], n4, + &SlN[n2+0*n4], n4, + std::complex(0.0), temp, ldtemp); + for(size_t i = 0; i < n2; ++i){ + temp[i+i*ldtemp] += 1.; + } // temp = (1 - S_12(0,l)S_21(l,N)) + + RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &S0l[0+n2*n4], n4, + S22bN, 1, + std::complex(0.0), al, 1); // al = S_12(0,l)S_22(l,N)bN + RNP::TBLAS::Axpy(n2, std::complex(1.0), S11a0, 1, al, 1); // al = S_11(0,l)*a0 + S_12(0,l)S_22(l,N)bN + + RNP::LinearSolve<'N'>(n2, 1, temp, ldtemp, al, n2, &info, pivots); + // al done + + // Make the other matrix + // Compute S_21(l,N)S_12(0,l) + RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex(-1.0), &SlN[n2+0*n4], n4, + &S0l[0+n2*n4], n4, + std::complex(0.0), temp, ldtemp); + for(size_t i = 0; i < n2; ++i){ + temp[i+i*ldtemp] += 1.; + } // temp = (1 - S_21(l,N)S_12(0,l)) + + RNP::TBLAS::MultMV<'N'>(n2, n2, std::complex(1.0), &SlN[n2+0*n4], n4, + S11a0, 1, + std::complex(0.0), bl, 1); // bl = S_21(l,N)S_11(0,l)a0 + RNP::TBLAS::Axpy(n2, std::complex(1.0), S22bN, 1, bl, 1); // bl = S_21(l,N)S_11(0,l)a0 + S_22(l,N)bN + RNP::LinearSolve<'N'>(n2, 1, temp, ldtemp, bl, n2, &info, pivots); + +#ifdef DUMP_MATRICES + DUMP_STREAM << "al:" << std::endl; + RNP::IO::PrintVector(n2,al,1, DUMP_STREAM) << std::endl << std::endl; + DUMP_STREAM << "bl:" << std::endl; + RNP::IO::PrintVector(n2,bl,1, DUMP_STREAM) << std::endl << std::endl; +#endif + + if(NULL == work_ || lwork < lwork_needed){ + rcwa_free(work); + } + if(NULL == iwork){ + rcwa_free(pivots); + } + return 0; +} + + + +void TranslateAmplitudes( + size_t n, // glist.n + const std::complex *q, // length 2*glist.n + double thickness, + double dz, + std::complex *ab +){ + const size_t n2 = 2*n; + for(size_t i = 0; i < n2; ++i){ + std::complex iq = std::complex(0,1)*q[i]; + // When extrapolating, the exponentials can be huge, so we need to check + // against overflow. This doesn't solve the problem completely, but it helps. + if(0. != ab[i]){ + ab[i] *= std::exp(iq*dz); + } + if(0. != ab[i+n2]){ + ab[i+n2] *= std::exp(iq*(thickness-dz)); + } + } +} + +void GetZPoyntingFlux( + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int epstype, + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *ab, // length 4*glist.n + std::complex *forward, + std::complex *backward, + std::complex *work +){ + const size_t n2 = 2*n; + + std::complex *a2 = work; + if(NULL == work){ + a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); + } + std::complex *b2 = a2 + n2; + std::complex *a3 = b2 + n2; + std::complex *b3 = a3 + n2; + + memcpy(a2, ab, sizeof(std::complex) * 2*n2); + for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } + for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } + if(NULL == phi){ + RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); + }else{ + RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); + } + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2); + if(NULL == phi){ + RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); + }else{ + RNP::TBLAS::MultMM<'C','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); + } + + // At this point a3[n2] is alpha and b3[n2] is beta, + // ab[n2] is a and (ab+n2)[n2] is b + + *forward = RNP::TBLAS::ConjugateDot(n2, ab ,1, a3,1).real(); + *backward = -RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, b3,1).real(); + std::complex diff = 0.5*(RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, a3,1) - RNP::TBLAS::ConjugateDot(n2, b3,1, ab,1)); + *forward += diff; + *backward += std::conj(diff); + + if(NULL == work){ + rcwa_free(a2); + } +} + +void GetZPoyntingFluxComponents( + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int epstype, + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *ab, // length 4*glist.n + std::complex *forward, + std::complex *backward, + std::complex *work +){ + const size_t n2 = 2*n; + + std::complex *a2 = work; + if(NULL == work){ + a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); + } + std::complex *b2 = a2 + n2; + std::complex *a3 = b2 + n2; + std::complex *b3 = a3 + n2; + + memcpy(a2, ab, sizeof(std::complex) * 2*n2); + for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } + for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } + if(NULL == phi){ + RNP::TBLAS::CopyMatrix<'A'>(n2,2, a2,n2, a3,n2); + }else{ + RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); + } + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2); + // At this point, a2[n2] contains alpha_e, b2[n2] contains beta_e + + if(NULL == phi){ + RNP::TBLAS::CopyMatrix<'A'>(n2,2, ab,n2, a3,n2); + }else{ + RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, ab,n2, std::complex(0.), a3,n2); + } + // At this point, a3[n2] contains alpha_h, b3[n2] contains beta_h + + for(size_t i = 0; i < n; ++i){ + forward[i] = 0; + backward[i] = 0; + for(size_t j = 0; j < n2; j+=n){ + const size_t k = i+j; + forward[i] += std::real(std::conj(a2[k])*a3[k]); + backward[i] -= std::real(std::conj(b2[k])*b3[k]); + std::complex diff = 0.5*(std::conj(b3[k])*a2[k] - std::conj(b2[k])*a3[k]); + forward[i] += diff; + backward[i] += std::conj(diff); + } + } + + if(NULL == work){ + rcwa_free(a2); + } +} + +static void GetInPlaneFieldVector( + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int epstype, + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *ab, // length 4*glist.n + std::complex *eh // length 8*2*glist.n +){ + // ab are the mode amplitudes times the f(z) operators in the column vector + // on the bottom row of eqn 35. By the time we enter this function, + // TranslateAmplitudes has already been called which does the multiplcation + // by the f(z), f(d-z) operators. Those operators are the things that take + // the mode amplitudes at each interface and use the + // eigenvalues/propagation constants q to find the values of the mode + // amplitudes at a particular z coordinate. The array has length 4n. The + // forward in the first 2n locations, and the backward mode amplitudes are + // stored in the remaining 2n locations. The forward amplitudes are + // relative to the front of the layer, while the backward amplitudes are + // relative to the back of the layer. + // So ab = [f(z)a, f(d-z)b] + + const size_t n2 = 2*n; + const size_t n4 = 2*n2; + + // This is the column vector on the bottom line of eqn 35 by q^-1 and + // storing it in eh[8N:12N]. This will be used for computing the top two + // blocks of the matrix in eqn 35 + // TODO: I don't understand why the omega is in here + for(size_t i = 0; i < n2; ++i){ + eh[4*n2+i] = ab[i]/(omega*q[i]); + eh[5*n2+i] = -ab[i+n2]/(omega*q[i]); + } + // This copies the array ab, which is of length 4n, to the 12N-th element + // of eh. eh is only 16N elements long, so this takes up the last section + // of space in the array. At this point eh[8N:16N] = [f(z)a/q -f(d-z)b/q + // f(z)a f(d-z)b]. The minus sign in the 2nd section is needed later, + // because the top right block of the matrix in eqn 35 has a minus sign. + RNP::TBLAS::Copy(n4, ab,1, &eh[6*n2],1); + if(NULL == phi){ + // This operation copies eh[8N:16N] (8N elements) to eh[0:8N] (also 8N + // elements) + RNP::TBLAS::CopyMatrix<'A'>(n2,4, &eh[4*n2],n2, &eh[0],n2); + }else{ + // This computes following matrix op + // C := alpha*A*B + beta*C + // m = 2N, n = 4, k = 2N + // A = phi (m x k) + // B = eh[8N:] (k x n) + // C = eh[0] (m x n) + // alpha = 1, beta = 0 + // This computes the first step of the operation in eqn 35 and stores + // it at the beginning of the eh array eh[0:8N] + RNP::TBLAS::MultMM<'N','N'>(n2,4,n2, std::complex(1.),phi,n2, &eh[4*n2],n2, std::complex(0.),&eh[0],n2); + } + // At this point, the 4 columns of &eh[0] (length n2) are + // [ phi.inv(q).a -phi.inv(q).b phi.a phi.b ] + // kp = k-parallel matrix + // kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] + // [ -kx*epsinv*ky kx*epsinv*kx ] + // = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] + // [ kx ] + // So kp is the thing inside the parentheses in the upper row of the matrix + // in equation 35 + // This function call peforms the matrix multiplication in the top row of + // eqn 35. There are 2 "columns" of 2N elements, so the first 4N elements + // of eh (see top of comments for contents of eh) get multiplied by the kp matrix + // and stored in eh[8N:12N] + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, &eh[0],n2, &eh[4*n2],n2); + // Computes eh[4N:6N] + eh[6N:8N] and stores in in eh[6N:8N]. This is + // adding the two bottom blocks of eqn 35 together after multiplication to + // get the hx and hy components + RNP::TBLAS::Axpy(n2, std::complex(1.),&eh[2*n2],1, &eh[3*n2],1); + // Computes eh[10N:12N] + eh[8N:10N] and stores it in eh[8N:10N]. This is + // just adding the two top blocks in eqn 35 together after all the + // M + // multiplication to get the -ey and ex components + RNP::TBLAS::Axpy(n2, std::complex(1.),&eh[5*n2],1, &eh[4*n2],1); + /* + // hx = eh[6N:7N] + const std::complex *hx = &eh[3*n2+0]; + // hy = eh[7N:8N] + const std::complex *hy = &eh[3*n2+n]; + // -ey = eh[8N:9N] + const std::complex *ney = &eh[4*n2+0]; + // ex = eh[9N:10N] + const std::complex *ex = &eh[4*n2+n]; + */ +} + +void GetFieldAtPoint( + size_t n, // glist.n + const double *kx, + const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL + int epstype, + const std::complex *ab, // length 4*glist.n + const double r[2], // coordinates within layer + std::complex efield[3], + std::complex hfield[3], + std::complex *work // 8*n2 +){ + // This is just a way of instantiating 1 with a complex number type, and 0 + // with a complex number type, which I think is necessary for type + // compatibility and whatnot with all the lapack functions. + const std::complex z_zero(0.); + const std::complex z_one(1.); + const size_t n2 = 2*n; + + // This is the memory workspace for the fourier coeffs of the fields. Its + // of length 16*N + std::complex *eh = work; + if(NULL == work){ + eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); + } + + // This is the function I need to modify. Right now it puts ex, ey, hx, hy + // inside eh (i.e the fourier coefficients of the transverse field + // components). I need to get at the fourier coefficients of d_normal and + // e_tangential and return them, then use the code below to get the real + // space reconstructions + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); + // So eh contains the fourier coefficients of hx, hy, -ey and ex. This code + // below sets up pointers to the various continous chunks of the array that + // contain the relevant fourier components. Notice the first 6N (3*2*N) + // elements of the array are skipped. + const std::complex *hx = &eh[3*n2+0]; + const std::complex *hy = &eh[3*n2+n]; + const std::complex *ney = &eh[4*n2+0]; + const std::complex *ex = &eh[4*n2+n]; + + std::complex fE[3], fH[3]; + fE[0] = 0; + fE[1] = 0; + fE[2] = 0; + fH[0] = 0; + fH[1] = 0; + fH[2] = 0; + + if(NULL != efield && NULL != epsilon_inv){ + // This is setting up the left side of equation 11 of the paper and + // storing in the first N elements of the eh array + for(size_t i = 0; i < n; ++i){ + eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); + } + // This multiplies epsilon_inv by eh[0:N] and copies it into the + // memory space of eh[N:2N]. Looking at equation 11 of the paper, you + // can see after setting up the left size we still need to multiply it + // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some + // reason that is done in the next loop. I think we could do it by + // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega + // and z_one by 1/omega in the MultMV call. Perhaps there is a reason + // the author didn't do it this way + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); + RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); + }else{ + // eh[n] = (1+0i) * epsilon_inv*eh + 0 * eh[n] + // The 0 * eh[n] part seems silly but it allows us to copy the + // result of the first term into eh[n] + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + } + } + + // This loop below is basically equation 5 from the paper!! + for(size_t i = 0; i < n; ++i){ + // This is (\vec{k} + \vec{G}) \cdot \vec{x} at each G vector + const double theta = (kx[i]*r[0] + ky[i]*r[1]); + // This is using Euler's rule to compute e^{i (\vec{k} + \vec{G}) \cdot \vec{x}) + const std::complex phase(cos(theta),sin(theta)); + // And this is just where each individual term is added to the total + fH[0] += hx[i]*phase; + fH[1] += hy[i]*phase; + fE[0] += ex[i]*phase; + fE[1] -= ney[i]*phase; + // These is eqn 14 from the paper. z component of h is expressed in + // terms of transverse components of e + fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega); + // This is the final bit of equation 11 from the paper. We need to + // divide by omega to get the fourier coefficients of e_z, then + // multiply by phase and sum to get real space resconstruction + fE[2] += eh[n+i] * (phase / omega); + } + + if(NULL != efield && NULL != epsilon_inv){ + efield[0] = fE[0]; + efield[1] = fE[1]; + efield[2] = fE[2]; + } + if(NULL != hfield){ + hfield[0] = fH[0]; + hfield[1] = fH[1]; + hfield[2] = fH[2]; + } + + if(NULL == work){ + rcwa_free(eh); + } +} + + +void GetFieldAtPointImproved( + size_t n, // glist.n + const double *kx, + const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL. This is \hat{\eta} in paper notation + const std::complex *P, // Projection operator onto tangential. 2nx2n matrix, size 4n^2 + const std::complex *W, // Weismann operator. 2nx2n matrix, size 4n^2 + const std::complex *Epsilon2, // curly Epsilon2 matrix (see eqn 51) + const std::complex epsilon, // The scalar, real space value of epsilon + int epstype, + const std::complex *ab, // length 4*glist.n + const double r[2], // coordinates within layer + std::complex efield[3], + std::complex hfield[3], + std::complex *work // 8*n2 +){ + // This is just a way of instantiating 1 with a complex number type, and 0 + // with a complex number type, which I think is necessary for type + // compatibility and whatnot with all the lapack functions. + const std::complex z_zero(0.); + const std::complex z_one(1.); + const size_t n2 = 2*n; + +#ifdef DUMP_MATRICES + if(NULL != P){ + DUMP_STREAM << "P:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,P,n2, DUMP_STREAM) << std::endl << std::endl; + DUMP_STREAM << "W:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,W,n2, DUMP_STREAM) << std::endl << std::endl; + } +#endif +#ifdef DUMP_MATRICES + if(NULL != W){ + DUMP_STREAM << "Epsilon2:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl; + } +#endif + + // This is the memory workspace for the fourier coeffs of the fields. Its + // of length 16*N + std::complex *eh = work; + if(NULL == work){ + eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); + } + + // The function below puts ex, ey, hx, hy inside eh (i.e the fourier + // coefficients of the transverse field components). I need to get at the + // fourier coefficients of d_normal and e_tangential, then use the code + // below this function call to get the real space reconstructions + // hx = eh[6N:7N] + // hy = eh[7N:8N] + // -ey = eh[8N:9N] + // ex = eh[9N:10N] + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); + // So eh contains the fourier coefficients of hx, hy, -ey and ex. This code + // below sets up pointers to the various continous chunks of the array that + // contain the relevant fourier components. Notice the first 6N (3*2*N) + // elements of the array are skipped. + // START HERE + // 1) I need to get access to Epsilon2, and use it to extract the fourier + // coefficients of dx, dy by multiplying it with [-ey ex]. In other words + // [-dy dx] = Epsilon2 [-ey ex]. + // 2) I then use P to project onto normal basis, and I - P to project onto + // tangential basis. + const std::complex *hx = &eh[3*n2+0]; + const std::complex *hy = &eh[3*n2+n]; + const std::complex *ney = &eh[4*n2+0]; + const std::complex *ex = &eh[4*n2+n]; + + // Arrays for the real space reconstructions of the 3 field components + std::complex fE[3], fH[3]; + fE[0] = 0; + fE[1] = 0; + fE[2] = 0; + fH[0] = 0; + fH[1] = 0; + fH[2] = 0; + + // So the equations look like this: + // [-d_{n,y} d_{n,x}] = W [-ey ex] + // \hat{T} = I - P + // [-e_{t,y} e_{t,x}] = T [-ey, ex] + // Remeber P is NULL for unpatterned layers, and we don't need to use this + // procedure for unpatterned layers anyway! + std::complex *dn_and_et = (std::complex*)rcwa_malloc(sizeof(std::complex) * 2*n2); + // These are just some convenience pointers to the various sections of + // the dn and et array + const std::complex *ndny = &dn_and_et[0]; + const std::complex *dnx = &dn_and_et[n]; + const std::complex *nety = &dn_and_et[n2]; + const std::complex *etx = &dn_and_et[n2+n]; + if(NULL != W){ + // We now have all we need to get the Fourier coefficients of the + // normal component of d and the tangential components of e. I + // allocated a new array for the dnormal components and etangential + // components. dnormal will be in the first 2N elements, etangential in + // the last 2N elements + // First dnormal + // This is my approach + /* RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,N,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); */ + // This is Weismann's approach verbatim + RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,W,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); +#ifdef DUMP_MATRICES + DUMP_STREAM << "Dnormal:" << std::endl; + RNP::IO::PrintVector(n2,dn_and_et,1,DUMP_STREAM) << std::endl << std::endl; +#endif + // Then lets construct \hat{T}. This is the operator that projects onto the + // direction tangential to a material interface, and is just I - P + std::complex *T; + // std::complex diag = std::complex(1.0*n); + std::complex diag = std::complex(1.0); + T = (std::complex*)rcwa_malloc(sizeof(std::complex) * n2*n2); + // Set T to the identity first + RNP::TBLAS::SetMatrix<'A'>(n2, n2, std::complex(0.), diag, T, n2); +#ifdef DUMP_MATRICES + DUMP_STREAM << "T:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,N,n2, DUMP_STREAM) << std::endl << std::endl; +#endif + // Now subtract P from it. This loops through each row and treats each row + // as a vector, computing + // T[i,:] = -1*P[i,:] + T[i,:] + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Axpy(n2, std::complex(-1.), &P[0+i*n2], 1, &T[0+i*n2], 1); + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "T:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,T,n2, DUMP_STREAM) << std::endl << std::endl; + /* DUMP_STREAM << "Epsilon_inv:" << std::endl; */ + /* RNP::IO::PrintMatrix(n,n, epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; */ +#endif + // Now we compute the matrix product of N and Epsilon2 and store it in + // the memory space of N + /* RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, std::complex(1.),N,n2,Epsilon2,n2,std::complex(0.),N,n2); */ + // Now etangential + RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,T,n2,&eh[4*n2],1, z_zero,&dn_and_et[n2],1); +#ifdef DUMP_MATRICES + DUMP_STREAM << "Etangential:" << std::endl; + RNP::IO::PrintVector(n2,&dn_and_et[n2],1,DUMP_STREAM) << std::endl << std::endl; +#endif + if(NULL != T){ + rcwa_free(T); + } + } + if(NULL != efield && NULL != epsilon_inv){ + // This is setting up the left side of equation 11 of the paper and + // storing in the first N elements of the eh array. The goal is to + // compute the ez components + for(size_t i = 0; i < n; ++i){ + eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); + } + // This multiplies epsilon_inv by eh[0:N] and copies it into the + // memory space of eh[N:2N]. Looking at equation 11 of the paper, you + // can see after setting up the left size we still need to multiply it + // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some + // reason that is done in the next loop. I think we could do it by + // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega + // and z_one by 1/omega in the MultMV call. Perhaps there is a reason + // the author didn't do it this way + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); + RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); + }else{ + // eh[n] = (1+0i) * epsilon_inv*eh[0:N] + 0 * eh[N] + // The 0 * eh[n] part seems silly but it allows us to copy the + // result of the first term into eh[n] + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + } + } + + // This loop below is basically equation 5 from the paper!! + for(size_t i = 0; i < n; ++i){ + // This is (\vec{k} + \vec{G}) \cdot \vec{x} at each G vector + const double theta = (kx[i]*r[0] + ky[i]*r[1]); + // This is using Euler's rule to compute e^{i (\vec{k} + \vec{G}) \cdot \vec{x}) + const std::complex phase(cos(theta),sin(theta)); + // And this is just where each individual term is added to the total + fH[0] += hx[i]*phase; + fH[1] += hy[i]*phase; + if(NULL != P){ + fE[0] += dnx[i]*(phase/epsilon) + etx[i]*phase; + fE[1] -= ndny[i]*(phase/epsilon) + nety[i]*phase; + } else { + fE[0] += ex[i]*phase; + fE[1] -= ney[i]*phase; + } + // These is eqn 14 from the paper. z component of h is expressed in + // terms of transverse components of e + fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega); + // This is the final bit of equation 11 from the paper. We need to + // divide by omega to get the fourier coefficients of e_z, then + // multiply by phase and sum to get real space resconstruction + fE[2] += eh[n+i] * (phase / omega); + } + + if(NULL != efield && NULL != epsilon_inv){ + efield[0] = fE[0]; + efield[1] = fE[1]; + efield[2] = fE[2]; + } + if(NULL != hfield){ + hfield[0] = fH[0]; + hfield[1] = fH[1]; + hfield[2] = fH[2]; + } + + if(NULL == work){ + rcwa_free(eh); + } + if(NULL != dn_and_et){ + rcwa_free(dn_and_et); + } +} + +void GetFieldOnGridImproved( + size_t n, // glist.n + int *G, + const double *kx, const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL || kp == NULL + const std::complex *P, // Projection operator onto tangential. 2nx2n matrix, size 4n^2 + const std::complex *W, // Weismann operator. 2nx2n matrix, size 4n^2 + const std::complex *epsilon, // The scalar, real space value of epsilon + int epstype, + const std::complex *ab, // length 4*glist.n + const size_t nxy[2], // number of points per lattice direction + std::complex *efield, + std::complex *hfield +){ + const std::complex z_zero(0.); + const std::complex z_one(1.); + const size_t n2 = 2*n; + // Total # of real space sampling points + const size_t N = nxy[0]*nxy[1]; + // Center of grid in real space + const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) }; + int inxy[2] = { (int)nxy[0], (int)nxy[1] }; + int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] }; + + std::complex *eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); + + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); + const std::complex *hx = &eh[3*n2+0]; + const std::complex *hy = &eh[3*n2+n]; + const std::complex *ney = &eh[4*n2+0]; + const std::complex *ex = &eh[4*n2+n]; + + // This is allocating memory for 6 Fourier transforms of size N (number of + // real space grid points requested). This makes sense because there are 3 + // E field components to compute, and 3 H field components to compute (6 + // overall) + // The two arrays below are arrays of pointers. This means each element in + // from/to is a pointer to an entirely new memory space that is allocated + // by the fft_alloc_complex function + std::complex *from[8]; + std::complex *to[8]; + fft_plan plan[8]; + for(unsigned i = 0; i < 8; ++i){ + from[i] = fft_alloc_complex(N); + to[i] = fft_alloc_complex(N); + memset(from[i], 0, sizeof(std::complex) * N); + // This is just a function that allocates a struct that "plans" or + // configures the Fourier transform. Mainly astruct containing the + // pointers to memory space containing real space data (from) and + // pointer to the memory space for the results (to) + plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1); + } + + // This is setting up the left side of equation 11 of the paper and + // storing in the first N elements of the eh array + for(size_t i = 0; i < n; ++i){ + eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); + } + // This multiplies epsilon_inv by eh[0:N] and copies it into the + // memory space of eh[N:2N]. Looking at equation 11 of the paper, you + // can see after setting up the left size we still need to multiply it + // by \hat{\epsilon}_z^-1. We haven't divided by omega yet, for some + // reason that is done in the next loop. I think we could do it by + // replaceing epsilon_inv[0] in the Scale call by epsilon_inv[0]/omega + // and z_one by 1/omega in the MultMV call. Perhaps there is a reason + // the author didn't do it this way + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); + RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); + }else{ + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + } + + // So the equations look like this: + // [-d_{n,y} d_{n,x}] = W [-ey ex] + // \hat{T} = I - P + // [-e_{t,y} e_{t,x}] = T [-ey, ex] + // Remeber P is NULL for unpatterned layers, and we don't need to use this + // procedure for unpatterned layers anyway! + std::complex *dn_and_et = (std::complex*)rcwa_malloc(sizeof(std::complex) * 2*n2); + // These are just some convenience pointers to the various sections of + // the dn and et array + const std::complex *ndny = &dn_and_et[0]; + const std::complex *dnx = &dn_and_et[n]; + const std::complex *nety = &dn_and_et[n2]; + const std::complex *etx = &dn_and_et[n2+n]; + if(NULL != W){ + // We now have all we need to get the Fourier coefficients of the + // normal component of d and the tangential components of e. I + // allocated a new array for the dnormal components and etangential + // components. dnormal will be in the first 2N elements, etangential in + // the last 2N elements + // First dnormal + // This is my approach + /* RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,N,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); */ + // This is Weismann's approach verbatim + RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,W,n2,&eh[4*n2],1, z_zero,&dn_and_et[0],1); +#ifdef DUMP_MATRICES + DUMP_STREAM << "Dnormal:" << std::endl; + RNP::IO::PrintVector(n2,dn_and_et,1,DUMP_STREAM) << std::endl << std::endl; +#endif + // Then lets construct \hat{T}. This is the operator that projects onto the + // direction tangential to a material interface, and is just I - P + std::complex *T; + // std::complex diag = std::complex(1.0*n); + std::complex diag = std::complex(1.0); + T = (std::complex*)rcwa_malloc(sizeof(std::complex) * n2*n2); + // Set T to the identity first + RNP::TBLAS::SetMatrix<'A'>(n2, n2, std::complex(0.), diag, T, n2); +#ifdef DUMP_MATRICES + DUMP_STREAM << "T:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,N,n2, DUMP_STREAM) << std::endl << std::endl; +#endif + // Now subtract P from it. This loops through each row and treats each row + // as a vector, computing + // T[i,:] = -1*P[i,:] + T[i,:] + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Axpy(n2, std::complex(-1.), &P[0+i*n2], 1, &T[0+i*n2], 1); + } +#ifdef DUMP_MATRICES + DUMP_STREAM << "T:" << std::endl; + RNP::IO::PrintMatrix(n2,n2,T,n2, DUMP_STREAM) << std::endl << std::endl; + /* DUMP_STREAM << "Epsilon_inv:" << std::endl; */ + /* RNP::IO::PrintMatrix(n,n, epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; */ +#endif + // Now etangential + RNP::TBLAS::MultMV<'N'>(n2,n2,z_one,T,n2,&eh[4*n2],1, z_zero,&dn_and_et[n2],1); +#ifdef DUMP_MATRICES + DUMP_STREAM << "Etangential:" << std::endl; + RNP::IO::PrintVector(n2,&dn_and_et[n2],1,DUMP_STREAM) << std::endl << std::endl; +#endif + if(NULL != T){ + rcwa_free(T); + } + } + + // Here is where we populate the matrices of fourier coefficients that we + // are then going to DFT + for(size_t i = 0; i < n; ++i){ + // G just contains indices for the two components of the G vectors. A + // pair of these indices defines a point in k-space, and the indices in + // this pair are stored adjacent to one another in the G array. So + // these are basically just (m, n). The indices run from -Nmax/2 to + // Nmax/2 where Nmax is the radius in k-space you get after circular + // truncation. + const int iu = G[2*i+0]; + const int iv = G[2*i+1]; + // This if block has to do with the shifting of the origin. Is it at + // the corner or the center of the unit cell? + if( + (nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) && + (nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1]) + ){ + + const int ii = (iu >= 0 ? iu : iu + nxy[0]); + const int jj = (iv >= 0 ? iv : iv + nxy[1]); + // I think I need to jump in here, and just populate from[3] with + // the fourier coefficients of Dnormal,x and Etangential,x and + // from[4] with the y components + // Dnormal_{x,y} inside from[3],from[4] and Etangential_{x,y} + // inside from[6],from[7] + from[0][ii+jj*nxy[0]] = hx[i]; + from[1][ii+jj*nxy[0]] = hy[i]; + from[2][ii+jj*nxy[0]] = (kx[i] * ney[i] + ky[i] * ex[i]) / omega; + from[3][ii+jj*nxy[0]] = dnx[i]; + from[4][ii+jj*nxy[0]] = -ndny[i]; + from[5][ii+jj*nxy[0]] = eh[n+i] / omega; + from[6][ii+jj*nxy[0]] = etx[i]; + from[7][ii+jj*nxy[0]] = -nety[i]; + } + } + + for(unsigned i = 0; i < 8; ++i){ + fft_plan_exec(plan[i]); + } + + for(size_t j = 0; j < nxy[1]; ++j){ + for(size_t i = 0; i < nxy[0]; ++i){ + // This is the real space reconstruction + hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]]; + hfield[3*(i+j*nxy[0])+1] = to[1][i+j*nxy[0]]; + hfield[3*(i+j*nxy[0])+2] = to[2][i+j*nxy[0]]; + // x component + efield[3*(i+j*nxy[0])+0] = to[3][i+j*nxy[0]]/epsilon[i+j*nxy[0]] + to[6][i+j*nxy[0]]; + // y component + efield[3*(i+j*nxy[0])+1] = to[4][i+j*nxy[0]]/epsilon[i+j*nxy[0]] + to[7][i+j*nxy[0]];; + // z component + efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]]; + } + } + + for(unsigned i = 0; i < 8; ++i){ + fft_plan_destroy(plan[i]); + fft_free(to[i]); + fft_free(from[i]); + } + rcwa_free(eh); +} + +void GetFieldOnGrid( + size_t n, // glist.n + int *G, + const double *kx, const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2, non NULL for efield != NULL || kp == NULL + int epstype, + const std::complex *ab, // length 4*glist.n + const size_t nxy[2], // number of points per lattice direction + std::complex *efield, + std::complex *hfield +){ + const std::complex z_zero(0.); + const std::complex z_one(1.); + const size_t n2 = 2*n; + // Total # of real space sampling points + const size_t N = nxy[0]*nxy[1]; + // Center of grid + const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) }; + int inxy[2] = { (int)nxy[0], (int)nxy[1] }; + int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] }; + + std::complex *eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); + + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); + const std::complex *hx = &eh[3*n2+0]; + const std::complex *hy = &eh[3*n2+n]; + const std::complex *ney = &eh[4*n2+0]; + const std::complex *ex = &eh[4*n2+n]; + + std::complex *from[6]; + std::complex *to[6]; + fft_plan plan[6]; + for(unsigned i = 0; i < 6; ++i){ + from[i] = fft_alloc_complex(N); + to[i] = fft_alloc_complex(N); + memset(from[i], 0, sizeof(std::complex) * N); + plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1); + } + + for(size_t i = 0; i < n; ++i){ + eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); + } + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); + RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); + }else{ + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + } + for(size_t i = 0; i < n; ++i){ + const int iu = G[2*i+0]; + const int iv = G[2*i+1]; + if( + (nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) && + (nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1]) + ){ + const int ii = (iu >= 0 ? iu : iu + nxy[0]); + const int jj = (iv >= 0 ? iv : iv + nxy[1]); + from[0][ii+jj*nxy[0]] = hx[i]; + from[1][ii+jj*nxy[0]] = hy[i]; + from[2][ii+jj*nxy[0]] = (kx[i] * ney[i] + ky[i] * ex[i]) / omega; + from[3][ii+jj*nxy[0]] = ex[i]; + from[4][ii+jj*nxy[0]] = -ney[i]; + from[5][ii+jj*nxy[0]] = eh[n+i] / omega; + } + } + + for(unsigned i = 0; i < 6; ++i){ + fft_plan_exec(plan[i]); + } + + for(size_t j = 0; j < nxy[1]; ++j){ + for(size_t i = 0; i < nxy[0]; ++i){ + hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]]; + hfield[3*(i+j*nxy[0])+1] = to[1][i+j*nxy[0]]; + hfield[3*(i+j*nxy[0])+2] = to[2][i+j*nxy[0]]; + efield[3*(i+j*nxy[0])+0] = to[3][i+j*nxy[0]]; + efield[3*(i+j*nxy[0])+1] = to[4][i+j*nxy[0]]; + efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]]; + } + } + + for(unsigned i = 0; i < 6; ++i){ + fft_plan_destroy(plan[i]); + fft_free(to[i]); + fft_free(from[i]); + } + rcwa_free(eh); +} + +void GetZStressTensorIntegral( + size_t n, // glist.n + const double *kx, + const double *ky, + std::complex omega, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2 + const std::complex *epsilon2, // size (2*glist.n)^2 + int epstype, + const std::complex *ab, // length 4*glist.n + std::complex integral[3], + std::complex *work +){ + const size_t n2 = 2*n; + const std::complex z_zero(0.); + const std::complex z_one(1.); + + std::complex *eh = work; + if(NULL == work){ + eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); + } + + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); + std::complex *ez = &eh[1*n2+n]; + std::complex *dz = &eh[2*n2+0]; + std::complex *hz = &eh[2*n2+n]; + const std::complex *hx = &eh[3*n2+0]; + const std::complex *hy = &eh[3*n2+n]; + const std::complex *ney = &eh[4*n2+0]; + const std::complex *ex = &eh[4*n2+n]; + std::complex *ndy = &eh[5*n2+0]; + std::complex *dx = &eh[5*n2+n]; + + integral[0] = 0; + integral[1] = 0; + integral[2] = 0; + + for(size_t i = 0; i < n; ++i){ + hz[i] = (kx[i] * -ney[i] - ky[i] * ex[i]) / omega; + dz[i] = (ky[i]*hx[i] - kx[i]*hy[i]) / omega; + } + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Copy(n, dz,1, ez, 1); + RNP::TBLAS::Scale(n, epsilon_inv[0], ez,1); + }else{ + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, dz,1, z_zero,ez,1); + } + RNP::TBLAS::MultMV<'N'>(n2,n2, z_one,epsilon2,n2, ney,1, z_zero,ndy,1); + + for(size_t i = 0; i < n; ++i){ + std::complex cdz = std::conj(dz[i]); + std::complex cbz = std::conj(hz[i]); + integral[0] += ex[i]*cdz; + integral[1] +=-ney[i]*cdz; + integral[2] += 0.5*ez[i]*cdz; + integral[2] -= 0.5*ney[i]*std::conj(ndy[i]); + integral[2] -= 0.5*ex[i]*std::conj(dx[i]); + + integral[0] += hx[i]*cbz; + integral[1] += hy[i]*cbz; + integral[2] += 0.5*hz[i]*std::conj(hz[i]); + integral[2] -= 0.5*hy[i]*std::conj(hy[i]); + integral[2] -= 0.5*hx[i]*std::conj(hx[i]); + } + + if(NULL == work){ + rcwa_free(eh); + } +} + +std::complex zsinc(const std::complex &z){ + if(std::abs(z) < DBL_EPSILON){ + return 1.0; + }else{ + return std::sin(z)/z; + } +} + +void GetLayerVolumeIntegral( + char which, + size_t n, // glist.n + const double *kx, + const double *ky, + std::complex omega, + double thickness, + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2 + const std::complex *epsilon2, // size (2*glist.n)^2 + int epstype, + const std::complex *ab, // length 4*glist.n + std::complex *integral, + std::complex *work +){ + const size_t n2 = 2*n; + const size_t n4 = 2*n2; + const std::complex omega2 = omega*omega; + const double aomega2(std::norm(omega)); + const double omega_2(1./aomega2); + std::complex omega2_2(1.); + if(0 != omega.imag()){ + omega2_2 = omega_2*omega2; + } + const std::complex z_zero(0.); + const std::complex z_one(1.); + + std::complex *Q = work; + if(NULL == work){ + Q = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); + } + RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,0., Q,n4); + + // Form CsC (in Q) + if('E' == which || 'U' == which){ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, epsilon2,n2, &Q[0+0*n4],n4); + }else if('e' == which){ + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., &Q[0+0*n4],n4); + } + if('H' == which || 'U' == which){ + for(size_t i = 0; i < n; ++i){ + Q[i+i*n4] += omega_2*kx[i]*kx[i]; + } + for(size_t i = 0; i < n; ++i){ + Q[i+n+i*n4] += omega_2*ky[i]*kx[i]; + } + for(size_t i = 0; i < n; ++i){ + Q[i+(i+n)*n4] += omega_2*kx[i]*ky[i]; + } + for(size_t i = 0; i < n; ++i){ + Q[i+n+(i+n)*n4] += omega_2*ky[i]*ky[i]; + } + } + // At this point, top block of CsC is done + if('E' == which || 'U' == which){ + MakeKPMatrix(omega, n, kx, ky, epsilon_inv, epstype, kp, &Q[n2+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, -omega_2, &Q[n2+(i+n2)*n4],1); + Q[i+n2+(i+n2)*n4] += z_one; + Q[i+n2+(i+n2)*n4] *= omega2_2; + } + }else if('e' == which){ + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::SetMatrix<'A'>(n,n, 0.,std::norm(epsilon_inv[0]), &Q[n2+n2*n4],n4); + }else{ + RNP::TBLAS::MultMM<'C','N'>(n,n,n, z_one,epsilon_inv,n, epsilon_inv,n, z_zero,&Q[n2+n2*n4],n4); + } + RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+n+n2*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+(n2+n)*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n,n, &Q[n2+n2*n4],n4, &Q[n2+n+(n2+n)*n4],n4); + for(size_t i = 0; i < n; ++i){ + RNP::TBLAS::Scale(n2, ky[i], &Q[n2+0+(n2+i)*n4],1); + RNP::TBLAS::Scale(n2, -kx[i], &Q[n2+0+(n2+n+i)*n4],1); + RNP::TBLAS::Scale(n2, ky[i], &Q[n2+i+n2*n4],n4); + RNP::TBLAS::Scale(n2, -kx[i], &Q[n2+n+i+n2*n4],n4); + } + } + if('H' == which || 'U' == which){ + for(size_t i = 0; i < n2; ++i){ + Q[i+n2+(i+n2)*n4] += z_one; + } + } + + // Apply the amplitude-to-field matrix M to either side + // CsC = [ A ] M = [ kp.phi.inv(q) -kp.phi.inv(q) ] = [ fie -fie ] + // [ B ] [ phi phi ] [ phi phi ] + // Q = M^H * CsC * M = [ S+T -S+T ], S = fie^H * A * fie + // [ -S+T S+T ] T = phi^H * B * phi + // integral = int_0^thickness [x(z)^H * Q * x(z)] dz, x(z) = [a*exp(iqz);b*exp(iq(d-z))] + + // Make T + if(NULL != phi){ + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[n2+n2*n4],n4, phi,n2, z_zero,&Q[0+n2*n4],n4); + RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,phi,n2, &Q[0+n2*n4],n4, z_zero, &Q[n2+n2*n4],n4); + } + // At this point: Q = [ A ] + // [ T ] + if(NULL == kp){ + MakeKPMatrix(omega, n, kx, ky, epsilon_inv, epstype, kp, &Q[0+n2*n4],n4); + } + const std::complex *kp_use = (kp != NULL ? kp : &Q[0+n2*n4]); + const size_t ldkp = (kp != NULL ? n2 : n4); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[0+0*n4],n4, kp_use,ldkp, z_zero,&Q[n2+0*n4],n4); + RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,kp_use,ldkp, &Q[n2+0*n4],n4, z_zero, &Q[0+0*n4],n4); + if(NULL != phi){ + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[0+0*n4],n4, phi,n2, z_zero,&Q[n2+0*n4],n4); + RNP::TBLAS::MultMM<'C','N'>(n2,n2,n2, z_one,phi,n2, &Q[n2+0*n4],n4, z_zero, &Q[0+0*n4],n4); + } + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(n2, 1./(omega*q[i]), &Q[0+i*n4],1); + RNP::TBLAS::Scale(n2, 1./(omega*q[i]), &Q[i+0*n4],n4); + } + // At this point: Q = [ S ] + // [ T ] + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[n2+0*n4],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[0+n2*n4],n4); + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Axpy(n2, -1., &Q[0+i*n4],1, &Q[n2+i*n4],1); + RNP::TBLAS::Axpy(n2, -1., &Q[0+i*n4],1, &Q[0+(i+n2)*n4],1); + } + // At this point: Q = [ S T-S ] + // [ T-S T ] + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Axpy(n2, 1., &Q[0+i*n4],1, &Q[n2+(i+n2)*n4],1); + } + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Q[n2+n2*n4],n4, &Q[0+0*n4],n4); + // Q done + + // The integral we need to construct is + // \int_0^d x(z)^H * Q * x(z) dz, where x(z) = [ a*e^{iqz}; b*e^{iq(d-z)} ] + // We are summing terms like + // \int_0^d x_i(z)^* x_j(z) Q_{ij} dz + // The generic form is + // Q_{ij} ab[i]^* ab[j] \int_0^d e^{-iq^* L_i(z)} e^{iq L_j(z)}, + // where L_i(z) is either z or d-z, and s_i is either 1 or -1. + // Note that L_i(d) = d or 0, so L_i(d) = d*(1+s_i)/2, and L_i(0) = d*(1-s_i)/2 + // The integral simplifies to + // \frac{1}{i(s_j q - s_i q^*)} [ e^{iq L_j(d)-iq^* L_i(d)} - e^{iq L_j(0)-iq^* L_i(0)} ] + // \frac{1}{i(s_j q - s_i q^*)} [ e^{id/2 (q (s_j+1)-q^* (s_i+1))} - e^{id/2 (q (1-s_j)-q^* (1-s_i))} ] + // \frac{1}{i(s_j q - s_i q^*)} [ e^{id/2 (q-q^*)} e^{id/2 (s_j q - s_i q^*)} - e^{id/2 (q-q^*)} e^{-id/2 (s_j q - s_i q^*)} ] + // \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} [ e^{id/2 (s_j q - s_i q^*)} - e^{-id/2 (s_j q - s_i q^*)} ] + // \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} 2 i sin[ d/2 (s_j q - s_i q^*) ] + // \frac{d e^{id/2 (q-q^*)} }{d/2 (s_j q - s_i q^*)} sin[ d/2 (s_j q - s_i q^*) ] + // d e^{id/2 (q-q^*)} sinc[ d/2 (s_j q - s_i q^*) ], where sinc(x) = sin(x)/x + + *integral = 0; + const double d_2 = 0.5*thickness; + for(size_t j = 0; j < n4; ++j){ + const double sj = (j < n2) ? 1 : -1; + const size_t qj = (j < n2) ? j : j-n2; + for(size_t i = 0; i < n4; ++i){ + const double si = (i < n2) ? 1 : -1; + const size_t qi = (i < n2) ? i : i-n2; + std::complex term = Q[i+j*n4] * std::conj(ab[i]) * ab[j]; + term *= thickness*std::exp(std::complex(0,d_2) * (q[qj] - std::conj(q[qi]))); + term *= zsinc(d_2 * (sj*q[qj] - si*std::conj(q[qi]))); + *integral += term; + } + } + if(NULL == work){ + rcwa_free(Q); + } +} + + +// returns Integral[ abs(XX)^2, z] for XX = ex, ey, ez, hx, hy, hz +void GetLayerZIntegral( + size_t n, // glist.n + const double *kx, + const double *ky, + std::complex omega, + const double &thickness, const double r[2], + const std::complex *q, // length 2*glist.n + const std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) + const std::complex *phi, // size (2*glist.n)^2 + const std::complex *epsilon_inv, // size (glist.n)^2 + const std::complex *epsilon2, // size (2*glist.n)^2 + int epstype, + const std::complex *ab, // length 4*glist.n + double integral[6], + std::complex *work +){ + static const std::complex zero(0.); + static const std::complex one(1.); + const size_t n2 = 2*n; + const size_t n4 = 2*n2; + const std::complex iomega(1./omega); + + std::complex *f = work; + if(NULL == work){ + f = (std::complex*)rcwa_malloc(sizeof(std::complex)*12*n4); + } + std::complex *g = f + 6*n4; + RNP::TBLAS::Fill(6*n4, 0., f, 1); + + // We will simultaneously generate f for all 6 components + + // Generate the Fourier phase factors + for(size_t i = 0; i < n; ++i){ // this is for ex + double theta = (kx[i]*r[0] + ky[i]*r[1]); + f[0*n4+1*n+i] = std::complex(cos(theta),-sin(theta)); // note the minus + } + RNP::TBLAS::Copy(n, &f[n],1, &f[1*n4+0*n],1); // -ey + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Copy(n, &f[n4],1, &f[2*n4+n2],1); // ez + RNP::TBLAS::Scale(n, iomega*epsilon_inv[0], &f[2*n4+n2],1); + }else{ + RNP::TBLAS::MultMV<'C'>(n,n, iomega,epsilon_inv,n, &f[n4],1, zero,&f[2*n4+n2],1); // ez + } + RNP::TBLAS::Copy(n, &f[2*n4+n2],1, &f[2*n4+n2+n],1); // ez + for(size_t i = 0; i < n; ++i){ // ez + f[2*n4+n2+i] *= ky[i]; + } + for(size_t i = 0; i < n; ++i){ // ez + f[2*n4+n2+n+i] *= -kx[i]; + } + RNP::TBLAS::Copy(n, &f[n4],1, &f[3*n4+n2],1); // hx + RNP::TBLAS::Copy(n, &f[n4],1, &f[4*n4+n2+n],1); // hy + for(size_t i = 0; i < n; ++i){ // -hz + f[5*n4+0*n+i] = f[n4+i] * iomega * kx[i]; + } + for(size_t i = 0; i < n; ++i){ // -hz + f[5*n4+1*n+i] = f[n4+i] * iomega * ky[i]; + } + + // Form M^H C^H * f + // M = [ kp.phi.inv(q) -kp.phi.inv(q) ] + // [ phi phi ] + if(NULL != phi){ + MultKPMatrix("C", omega, n, kx, ky, epsilon_inv, epstype, kp, 6, f,n4, &g[n2],n4); + RNP::TBLAS::MultMM<'C','N'>(n2,6,n2, one,phi,n2, &g[n2],n4, zero,&g[0 ],n4); + RNP::TBLAS::MultMM<'C','N'>(n2,6,n2, one,phi,n2, &f[n2],n4, zero,&g[n2],n4); + }else{ + MultKPMatrix("C", omega, n, kx, ky, epsilon_inv, epstype, kp, 6, f,n4, &g[0],n4); + RNP::TBLAS::CopyMatrix<'A'>(n2,6, &f[n2],n4, &g[n2],n4); + } // top half of g is phi^H kp^H ftop, bottom half is phi^H fbot + for(size_t i = 0; i < n2; ++i){ + RNP::TBLAS::Scale(6, 1./std::conj(omega*q[i]), &g[i],n4); + } + for(size_t j = 0; j < 6; ++j){ + for(size_t i = 0; i < n2; ++i){ + std::complex t(g[i+j*n4]); + g[i+j*n4] += g[i+n2+j*n4]; + g[i+n2+j*n4] -= t; + } + } + + for(size_t i = 0; i < 6; ++i){ + integral[i] = 0; + } + + // At this point, each column of g contains a vector v such that + // v v^H is a rank-1 matrix that must be integrated over. + const double d_2 = 0.5*thickness; + for(size_t j = 0; j < n4; ++j){ + const double sj = (j < n2) ? 1 : -1; + const size_t qj = (j < n2) ? j : j-n2; + for(size_t i = 0; i < n4; ++i){ + const double si = (i < n2) ? 1 : -1; + const size_t qi = (i < n2) ? i : i-n2; + std::complex term = std::conj(ab[i]) * ab[j]; + term *= thickness*std::exp(std::complex(0,d_2) * (q[qj] - std::conj(q[qi]))); + term *= zsinc(d_2 * (sj*q[qj] - si*std::conj(q[qi]))); + for(size_t ii = 0; ii < 6; ++ii){ + integral[ii] += std::real(std::conj(g[j+ii*n4])*g[i+ii*n4] * term); + } + } + } + + if(NULL == work){ + rcwa_free(f); + } +} diff --git a/examples/bontempi_et_al_Nanoscale_2017.png b/examples/bontempi_et_al_Nanoscale_2017.png new file mode 100644 index 0000000..049e568 Binary files /dev/null and b/examples/bontempi_et_al_Nanoscale_2017.png differ diff --git a/gensetupHPC.py.sh b/gensetupHPC.py.sh new file mode 100644 index 0000000..6cf65b2 --- /dev/null +++ b/gensetupHPC.py.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +OBJDIR="$1" +LIBFILE="$2" +LIBS="$3" +x1="$4" +x2="$5" + +cat < setup.py +from distutils.core import setup, Extension + +libs = ['S4', 'stdc++'] +libs.extend( [lib[2::] for lib in '$LIBS'.split()]) + +S4module = Extension('S4', + sources = [ + 'S4/main_python.c' + ], + libraries = libs, + library_dirs = ['$OBJDIR', $x2], + extra_link_args = [ + '$LIBFILE', $x1 + ], + extra_compile_args=['-std=gnu99'] +) + +setup(name = 'S4', + version = '1.1', + description = 'Stanford Stratified Structure Solver (S4): Fourier Modal Method', + ext_modules = [S4module] +) +SETUPPY