diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b66b2c2a..b14383b4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,6 +57,7 @@ jobs: shell: bash run: | export PARRSB_RSB_ALGO=0 + export PARRSB_VERBOSE_LEVEL=2 cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }} cd ${CIDIR}/${{ matrix.test }} @@ -70,6 +71,7 @@ jobs: run: | export PARRSB_RSB_ALGO=1 export PARRSB_RSB_MG_FACTOR=2 + export PARRSB_VERBOSE_LEVEL=2 cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }} cd ${CIDIR}/${{ matrix.test }} @@ -83,20 +85,7 @@ jobs: run: | export PARRSB_RSB_ALGO=1 export PARRSB_RSB_MG_FACTOR=4 - - cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }} - cd ${CIDIR}/${{ matrix.test }} - - tol=(`cat test.txt | grep tol`); tol=${tol[2]} - ${MPIEXE} -np ${{ matrix.np }} ./genmap --mesh ${{ matrix.test }} \ - --tol=${tol} --dump=0 --test=1 - - name: genmap-mg-factor-4-smooth - if: always() - shell: bash - run: | - export PARRSB_RSB_ALGO=1 - export PARRSB_RSB_MG_FACTOR=4 - export PARRSB_RSB_MG_SMOOTH_AGGREGATION=1 + export PARRSB_VERBOSE_LEVEL=2 cp ${EXAMPLESDIR}/genmap ${CIDIR}/${{ matrix.test }} cd ${CIDIR}/${{ matrix.test }} diff --git a/.github/workflows/coarse.yml b/.github/workflows/coarse.yml deleted file mode 100644 index 3d19b685..00000000 --- a/.github/workflows/coarse.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: Coarse tests -on: - push: - branch: [main] - pull_request: - branch: [main] -env: - GITHUB.TOKEN: ${{ secrets.token }} - CIDIR: parRSB-github-ci - EXAMPLESDIR: build/examples - MPIEXE: "mpirun --oversubscribe" -jobs: - coarse: - runs-on: ubuntu-latest - strategy: - matrix: - test: [box_2x2x2, box_10x1x1, box_3x5x7, pyramid, solid, ethier, vortex, expansion] - np: [2, 3, 4] - fail-fast: false - name: "Coarse: ${{ matrix.test }}, NP = ${{ matrix.np }}" - env: - GSVER: 1.0.7 - CC: mpicc - steps: - - uses: actions/checkout@v3 - - name: Install apt dependencies - shell: bash - run: | - sudo apt -y update - sudo apt install -y 
openmpi-bin libopenmpi-dev - sudo apt install -y libblas-dev liblapack-dev - sudo apt install -y build-essential - - name: Build parRSB and clone tests - shell: bash - run: | - # Build gslib - git clone https://github.com/Nek5000/gslib.git - make -C gslib -j4 - - # Build parRSB - export GSLIBPATH=`pwd`/gslib/build/ - make -j4 examples - - # Clone tests - git clone https://github.com/thilinarmtb/${CIDIR}.git - - name: schur - if: always() - shell: bash - run: | - cp ${EXAMPLESDIR}/coarse ${CIDIR}/${{ matrix.test }} - cd ${CIDIR}/${{ matrix.test }} - - tol=(`cat test.txt | grep tol`); tol=${tol[2]} - ${MPIEXE} -np ${{ matrix.np }} ./coarse --mesh ${{ matrix.test }} \ - --tol=${tol} --crs_tol=1e-12 diff --git a/.github/workflows/ilu.yml b/.github/workflows/ilu.yml deleted file mode 100644 index 763d1c18..00000000 --- a/.github/workflows/ilu.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: ILU tests -on: - push: - branch: [main] - pull_request: - branch: [main] -env: - GITHUB.TOKEN: ${{ secrets.token }} - CIDIR: parRSB-github-ci - EXAMPLESDIR: build/examples - MPIEXE: "mpirun --oversubscribe" -jobs: - ilu: - runs-on: ubuntu-latest - strategy: - matrix: - test: [box_2x2x2, box_10x1x1, box_3x5x7, pyramid, solid, ethier, vortex, expansion] - np: [1, 2, 3, 4] - fail-fast: false - name: "ILU: ${{ matrix.test }}, NP = ${{ matrix.np }}" - env: - GSVER: 1.0.7 - CC: mpicc - steps: - - uses: actions/checkout@v3 - - name: Install apt dependencies - shell: bash - run: | - sudo apt -y update - sudo apt install -y openmpi-bin libopenmpi-dev - sudo apt install -y libblas-dev liblapack-dev - sudo apt install -y build-essential - sudo apt install -y octave - - name: Build parRSB and clone tests - shell: bash - run: | - # Build gslib - git clone https://github.com/Nek5000/gslib.git - make -C gslib -j4 - - # Build parRSB - export GSLIBPATH=`pwd`/gslib/build/ - make -j4 examples - - # Clone tests - git clone https://github.com/thilinarmtb/${CIDIR}.git - - name: ilu0 - if: always() - shell: bash 
- run: | - export PARRSB_DUMP_ILU=1 - - cp ${EXAMPLESDIR}/ilu .github/workflows/ilu0.m ${CIDIR}/${{ matrix.test }} - cd ${CIDIR}/${{ matrix.test }} - - tol=(`cat test.txt | grep tol`); tol=${tol[2]} - ${MPIEXE} -np ${{ matrix.np }} ./ilu --mesh ${{ matrix.test }} \ - --tol=${tol} --ilu_type=0 - - octave-cli ilu0.m - - name: iluc - if: always() - shell: bash - run: | - export PARRSB_DUMP_ILU=1 - - cp ${EXAMPLESDIR}/ilu .github/workflows/iluc.m ${CIDIR}/${{ matrix.test }} - cd ${CIDIR}/${{ matrix.test }} - - tol=(`cat test.txt | grep tol`); tol=${tol[2]} - ${MPIEXE} -np ${{ matrix.np }} ./ilu --mesh ${{ matrix.test }} \ - --tol=${tol} --ilu_type=1 - - octave-cli iluc.m diff --git a/.github/workflows/ilu0.m b/.github/workflows/ilu0.m deleted file mode 100644 index 92e959bc..00000000 --- a/.github/workflows/ilu0.m +++ /dev/null @@ -1,11 +0,0 @@ -load 'A.txt'; -load 'B.txt'; -A = spconvert(A); -B = spconvert(B); - -[L, U] = ilu(A); -n = size(A, 1); -I = speye(n); -err = norm(L + U - B - I, Inf); -printf('ILU err = %f', err); -assert(err < 1e-8); diff --git a/.github/workflows/iluc.m b/.github/workflows/iluc.m deleted file mode 100644 index cba26949..00000000 --- a/.github/workflows/iluc.m +++ /dev/null @@ -1,10 +0,0 @@ -load 'A.txt' -load 'LL.txt' -load 'UU.txt' - -A = spconvert(A); -LL = spconvert(LL); -UU = spconvert(UU); -err = norm(A - LL * UU, Inf); -printf('LU error: %f\n', err); -assert(err < 1e-8); diff --git a/Makefile b/Makefile index 555766cd..60a845f8 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC ?= mpicc -CFLAGS ?= +CFLAGS ?= -Wall -Wextra -Wpedantic -Wno-unused-function -Wno-unused-parameter -std=c99 LDFLAGS ?= DEBUG ?= 0 MPI ?= 1 @@ -8,6 +8,7 @@ SYNC_BY_REDUCTION ?= 1 BLAS ?= 0 BLASDIR ?= BLASFLAGS ?= -lblas -llapack +GSLIBPATH ?= ########################## Don't touch what follows ########################### ifeq ($(GSLIBPATH),) @@ -19,9 +20,10 @@ SRCROOT := $(realpath $(patsubst %/,%,$(dir $(MKFILEPATH)))) SRCDIR = $(SRCROOT)/src EXAMPLEDIR = 
$(SRCROOT)/examples BUILDROOT = $(SRCROOT)/build -INSTALLROOT = $(BUILDROOT)/install -ifneq ($(strip $(DESTDIR)),) - INSTALLROOT = $(realpath $(DESTDIR)) +ifneq (,$(strip $(DESTDIR))) +INSTALLROOT = $(DESTDIR) +else +INSTALLROOT = $(SRCROOT)/install endif SRCS = $(wildcard $(SRCDIR)/*.c) diff --git a/examples/coarse.c b/examples/coarse.c deleted file mode 100644 index 67909113..00000000 --- a/examples/coarse.c +++ /dev/null @@ -1,153 +0,0 @@ -//============================================================================= -// Test Schur complement solver -// -#include "coarse.h" -#include "parRSB.h" - -#include -#include - -static double check_err(double *b, double *x, uint nelt, uint nv, - const slong *vtx, MPI_Comm comm) { - struct comm c; - comm_init(&c, comm); - - slong out[2][1], buf[2][1], in = nelt; - comm_scan(out, &c, gs_long, gs_add, &in, 1, buf); - ulong start = out[0][0] + 1; - - ulong *eid = tcalloc(ulong, nelt); - for (uint i = 0; i < nelt; i++) - eid[i] = start + i; - - struct crystal cr; - crystal_init(&cr, &c); - - buffer bfr; - buffer_init(&bfr, 1024); - - struct array nbrs, eij; - find_nbrs(&nbrs, eid, vtx, nelt, nv, &cr, &bfr); - compress_nbrs(&eij, &nbrs, &bfr); - - struct par_mat M; - par_csr_setup(&M, &eij, 1, &bfr); - assert(M.rn > 0); - - free(eid), array_free(&nbrs), array_free(&eij); - - struct gs_data *gsh = setup_Q(&M, &c, &bfr); - double *bl = tcalloc(double, nelt); - double *wrk = tcalloc(double, M.rn + M.adj_off[M.rn]); - mat_vec_csr(bl, x, &M, gsh, wrk, &bfr); - - crystal_free(&cr), comm_free(&c); - gs_free(gsh), par_mat_free(&M); - - double norm = 0.0; - for (uint i = 0; i < nelt; i++) - norm += (bl[i] - b[i]) * (bl[i] - b[i]); - MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_SUM, comm); - - free(wrk), free(bl); - buffer_free(&bfr); - - return sqrt(norm); -} - -static void setup_rhs(double *b, const unsigned int nelt, MPI_Comm comm) { - srand(time(NULL)); - double sum = 0; - for (int i = 0; i < nelt; i++) { - b[i] = (rand() % 
50 + 1.0) / 10; - sum += b[i]; - } - MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, comm); - - long long ng = nelt; - MPI_Allreduce(MPI_IN_PLACE, &ng, 1, MPI_LONG_LONG, MPI_SUM, comm); - sum /= ng; - - double norm = 0; - for (int i = 0; i < nelt; i++) { - b[i] -= sum; - norm += b[i] * b[i]; - } - - MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_SUM, comm); - norm = sqrt(norm); - - for (int i = 0; i < nelt; i++) - b[i] /= norm; -} - -static void setup_and_solve(unsigned nelt, unsigned nv, const long long *vl, - const scalar *centroids, - const parrsb_cmd_line_opts *in, MPI_Comm comm) { - // Setup the coarse solve with schur complement solver - struct comm c; - comm_init(&c, comm); - - comm_barrier(&c); - double t = comm_time(); - struct coarse *crs = - coarse_setup(nelt, nv, vl, centroids, 1, in->crs_type, &c); - double tsetup = comm_time() - t; - - scalar *b = tcalloc(scalar, 2 * nelt); - setup_rhs(b, nelt, comm); - - comm_barrier(&c); - t = comm_time(); - scalar *x = b + nelt; - coarse_solve(x, crs, b, in->crs_tol); - double tsolve = MPI_Wtime() - t; - - double enorm = check_err(b, x, nelt, nv, vl, comm); - if (c.id == 0) { - printf("MPI Ranks = %d\ncoarse_setup: %lf\ncoarse_solve = %lf\nerr = %lf\n", - c.np, tsetup, tsolve, enorm); - fflush(stdout); - } - int err = (enorm > 10 * in->crs_tol); - parrsb_check_error(err, comm); - - // Free resources - coarse_free(crs), free(b); - comm_free(&c); -} - -int main(int argc, char *argv[]) { - MPI_Init(&argc, &argv); - MPI_Comm comm = MPI_COMM_WORLD; - - parrsb_cmd_line_opts *in = parrsb_parse_cmd_opts(argc, argv); - parrsb_check_error(in == NULL, comm); - - // Read the geometry from the .re2 file, find connectiviy, partition and then - // distribute the mesh. - unsigned nelt, nv; - long long *vl = NULL; - double *coord = NULL; - int err = parrsb_setup_mesh(&nelt, &nv, &vl, &coord, in, comm); - parrsb_check_error(err, comm); - - int ndim = (nv == 8) ? 
3 : 2; - double *centroids = tcalloc(double, nelt *ndim); - for (uint i = 0; i < nelt; i++) { - for (int j = 0; j < nv; j++) { - for (int d = 0; d < ndim; d++) - centroids[i * ndim + d] += coord[i * ndim * nv + j * ndim + d]; - } - for (int d = 0; d < ndim; d++) - centroids[i * ndim + d] /= nv; - } - - setup_and_solve(nelt, nv, vl, centroids, in, comm); - - free(vl), free(coord), free(centroids); - parrsb_cmd_opts_free(in); - MPI_Finalize(); - - return 0; -} diff --git a/examples/genmap.c b/examples/genmap.c index a5307487..90941149 100644 --- a/examples/genmap.c +++ b/examples/genmap.c @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) { parrsb_check_error(part == NULL, comm); parrsb_options options = parrsb_default_options; - err = parrsb_part_mesh(part, NULL, vl, coord, nelt, nv, options, comm); + err = parrsb_part_mesh(part, vl, coord, NULL, nelt, nv, &options, comm); parrsb_check_error(err, comm); // Redistribute data based on identified partitions diff --git a/examples/ilu.c b/examples/ilu.c deleted file mode 100644 index 0ade8e51..00000000 --- a/examples/ilu.c +++ /dev/null @@ -1,36 +0,0 @@ -//============================================================================= -// Test ILU factorization -// -#include "ilu.h" -#include "parRSB.h" - -int main(int argc, char *argv[]) { - MPI_Init(&argc, &argv); - MPI_Comm comm = MPI_COMM_WORLD; - - parrsb_cmd_line_opts *in = parrsb_parse_cmd_opts(argc, argv); - parrsb_check_error(in == NULL, comm); - - // Read the geometry from the .re2 file, find connectiviy, partition and then - // distribute the mesh. 
- unsigned int nelt, nv; - long long *vl = NULL; - double *coord = NULL; - parrsb_setup_mesh(&nelt, &nv, &vl, &coord, in, comm); - - // Setup ILU - ilu_options iluopt = {.type = in->ilu_type, - .tol = in->ilu_tol, - .pivot = in->ilu_pivot, - .verbose = in->verbose, - .nnz_per_row = 0}; - struct ilu *ilu = ilu_setup(nelt, nv, vl, &iluopt, comm); - ilu_free(ilu); - - // Free resources - free(vl), free(coord); - parrsb_cmd_opts_free(in); - MPI_Finalize(); - - return 0; -} diff --git a/src/coarse-impl.h b/src/coarse-impl.h deleted file mode 100644 index 65f8ff9c..00000000 --- a/src/coarse-impl.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _PARRSB_COARSE_IMPL_H_ -#define _PARRSB_COARSE_IMPL_H_ - -#include "coarse.h" - -uint unique_ids(sint *perm, ulong *uid, uint n, const ulong *ids, buffer *bfr); - -struct coarse { - unsigned type; // type = schur-2-lvl, schur-3-lvl - unsigned null_space; // Is there a null space or not - uint un; // User vector size - uint cn; // Compressed (ignoring duplicates and zero global ids) vector size - uint an; // Assembled size -- this is the local size of the assmebled coarse - // matrix - sint *u2c; // Mapping from user vector to compress vector - struct gs_data *c2a; // Mapping from compressed vector to assmbled vector - buffer bfr; - - ulong s[3], ng[3]; - uint n[3]; - struct comm c; - void *solver; -}; - -int schur_setup(struct coarse *crs, struct array *eij, struct crystal *cr, - buffer *bfr); -int schur_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol, - buffer *bfr); -int schur_free(struct coarse *crs); - -#endif diff --git a/src/coarse-laplacian.c b/src/coarse-laplacian.c deleted file mode 100644 index 9dc2dd6d..00000000 --- a/src/coarse-laplacian.c +++ /dev/null @@ -1,293 +0,0 @@ -#include "coarse-impl.h" -#include "metrics.h" -#include - -//------------------------------------------------------------------------------ -// Setup coarse grid system. Initial dumb API. -// -// Number rows, local first then interface. 
Returns global number of local -// elements. -struct rcb_t { - uint i, s; - double coord[3]; - slong vtx[8]; -}; - -static void nmbr_local_rcb(struct array *a, uint s, uint e, const unsigned nc, - const unsigned ndim, const unsigned level, - struct comm *c, buffer *bfr) { - sint size = e - s; - if (size <= 1) - return; - - double max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX}, - min[3] = {DBL_MAX, DBL_MAX, DBL_MAX}; - - struct rcb_t *pa = (struct rcb_t *)a->ptr; - for (uint i = s; i < e; i++) { - for (int j = 0; j < ndim; j++) { - if (pa[i].coord[j] < min[j]) - min[j] = pa[i].coord[j]; - if (pa[i].coord[j] > max[j]) - max[j] = pa[i].coord[j]; - } - } - - double len = max[0] - min[0]; - int axis = 0; - for (int j = 1; j < ndim; j++) { - if (max[j] - min[j] > len) - axis = j, len = max[j] - min[j]; - } - - struct rcb_t *ps = pa + s; - switch (axis) { - case 0: - sarray_sort(struct rcb_t, ps, size, coord[0], 3, bfr); - break; - case 1: - sarray_sort(struct rcb_t, ps, size, coord[1], 3, bfr); - break; - case 2: - sarray_sort(struct rcb_t, ps, size, coord[2], 3, bfr); - break; - default: - break; - } - - // Number the elements in the interface - uint npts = size * nc; - slong *vtx = tcalloc(slong, npts); - for (uint i = s, k = 0; i < e; i++) { - for (int j = 0; j < nc; j++, k++) - vtx[k] = pa[i].vtx[j]; - } - - struct gs_data *gsh = gs_setup(vtx, npts, c, 0, gs_pairwise, 0); - - sint *dof = tcalloc(sint, npts); - uint mid = (s + e) / 2; - for (uint i = mid, k = (mid - s) * nc; i < e; i++) { - for (int j = 0; j < nc; j++, k++) - dof[k] = 1; - } - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - for (uint i = mid, k = (mid - s) * nc; i < e; i++) { - for (int j = 0; j < nc; j++, k++) - dof[k] = 0; - } - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - for (uint i = s, k = 0; i < e; i++, k++) { - for (int j = 0; j < nc; j++) { - if (dof[k * nc + j] > 0 && pa[i].s == INT_MAX) { - pa[i].s = level; - break; - } - } - } - - gs_free(gsh); - free(dof), free(vtx); - - nmbr_local_rcb(a, s, mid, nc, 
ndim, level + 1, c, bfr); - nmbr_local_rcb(a, mid, e, nc, ndim, level + 1, c, bfr); -} - -// Number the DOFs internal first, faces second and all the rest (wire basket) -// next. This keeps zeros as is and renumber the positive entries in `ids` -// array. -static void number_dual_graph_dofs(ulong *dofs, struct coarse *crs, uint n, - const slong *ids, uint nelt, unsigned ndim, - const scalar *coord, buffer *bfr) { - int nnz = (n > 0); - struct comm c; - comm_split(&crs->c, nnz, crs->c.id, &c); - - unsigned nc = n / nelt; - uint i, j; - if (nnz) { - sint *dof = tcalloc(sint, n); - int level = 1; - while (c.np > 1) { - struct gs_data *gsh = gs_setup(ids, n, &c, 0, gs_pairwise, 0); - - int bin = (c.id >= (c.np + 1) / 2); - for (i = 0; i < n; i++) - dof[i] = bin; - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - if (bin == 1) { - for (i = 0; i < n; i++) - dof[i] = 0; - } - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - for (i = 0; i < nelt; i++) { - for (j = 0; j < nc; j++) { - if (dof[i * nc + j] > 0 && !dofs[i]) { - dofs[i] = level; - break; - } - } - } - - gs_free(gsh); - - struct comm t; - comm_split(&c, bin, c.id, &t); - comm_free(&c); - comm_dup(&c, &t); - comm_free(&t); - - level++; - } - free(dof); - } - - for (i = crs->n[0] = crs->n[1] = 0; i < nelt; i++) { - if (dofs[i] > 0) - crs->n[1]++; - else - crs->n[0]++; - } - - slong in[2] = {crs->n[0], crs->n[1]}, out[2][2], wrk[2][2]; - comm_scan(out, &crs->c, gs_long, gs_add, in, 2, wrk); - crs->s[0] = out[0][0] + 1, crs->ng[0] = out[1][0]; - crs->s[1] = out[0][1] + 1, crs->ng[1] = out[1][1]; - - struct array local; - array_init(struct rcb_t, &local, crs->n[0]); - - struct rcb_t t = {.s = INT_MAX}; - ulong s = crs->ng[0] + crs->s[1]; - for (uint i = 0; i < nelt; i++) { - if (dofs[i] > 0) - dofs[i] = s++; - else { - t.i = i; - memcpy(t.coord, &coord[i * ndim], ndim * sizeof(scalar)); - memcpy(t.vtx, &ids[i * nc], nc * sizeof(slong)); - array_cat(struct rcb_t, &local, &t, 1); - } - } - - if (local.n > 0) { - 
nmbr_local_rcb(&local, 0, local.n, nc, ndim, 1, &c, bfr); - sarray_sort(struct rcb_t, local.ptr, local.n, s, 0, bfr); - struct rcb_t *pl = (struct rcb_t *)local.ptr; - ulong s = crs->s[0]; - for (sint i = local.n - 1; i >= 0; i--) - dofs[pl[i].i] = s++; - } - - comm_free(&c); - array_free(&local); -} - -struct coarse *coarse_setup(unsigned n, unsigned nc, const long long *vl, - const scalar *coord, unsigned null_space, - unsigned type, struct comm *c) { - comm_barrier(c); - double tcrs = comm_time(); - - // crs->un is the user vector size. - struct coarse *crs = tcalloc(struct coarse, 1); - crs->null_space = null_space, crs->type = type, crs->un = n; - for (unsigned i = 0; i < 3; i++) - crs->ng[i] = crs->s[i] = crs->n[i] = 0; - - // Setup the buffer and duplicate the communicator. - buffer_init(&crs->bfr, 1024); - comm_dup(&crs->c, c); - - uint size = n * nc; - slong *tid = tcalloc(slong, size); - for (uint i = 0; i < size; i++) - tid[i] = vl[i]; - - ulong *nid = tcalloc(ulong, n); - unsigned ndim = (nc == 8) ? 3 : 2; - number_dual_graph_dofs(nid, crs, size, tid, crs->un, ndim, coord, &crs->bfr); - - // Find unique ids and user vector to compressed vector mapping. - // In the case of dual-graph Laplacian, all the ids are unique. - // But here we arrange them in the sorted order. - ulong *uid = tcalloc(ulong, n); - crs->u2c = tcalloc(sint, n); - crs->cn = unique_ids(crs->u2c, uid, crs->un, nid, &crs->bfr); - crs->an = crs->cn; - - struct crystal cr; - crystal_init(&cr, &crs->c); - - struct array nbrs, eij; - find_nbrs(&nbrs, nid, tid, n, nc, &cr, &crs->bfr); - // Convert `struct nbr` -> `struct mij` and compress entries which share the - // same (r, c) values. 
Set the diagonal element to have zero row sum - compress_nbrs(&eij, &nbrs, &crs->bfr); - array_free(&nbrs); - - switch (type) { - case 0: - schur_setup(crs, &eij, &cr, &crs->bfr); - break; - default: - break; - } - - array_free(&eij), crystal_free(&cr); - free(tid), free(nid), free(uid); - - return crs; -} - -void coarse_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol) { - metric_init(); - - scalar *rhs = tcalloc(scalar, 2 * crs->an), *xx = rhs + crs->an; - for (uint i = 0; i < crs->un; i++) { - if (crs->u2c[i] >= 0) - rhs[crs->u2c[i]] += b[i]; - } - - switch (crs->type) { - case 0: - schur_solve(xx, crs, rhs, tol, &crs->bfr); - break; - default: - break; - } - - for (uint i = 0; i < crs->un; i++) { - if (crs->u2c[i] >= 0) - x[i] = xx[crs->u2c[i]]; - } - free(rhs); - - metric_push_level(); - metric_crs_print(&crs->c, 1); - metric_finalize(); -} - -void coarse_free(struct coarse *crs) { - if (crs != NULL) { - switch (crs->type) { - case 0: - schur_free(crs); - break; - default: - break; - } - if (crs->u2c) - free(crs->u2c); - comm_free(&crs->c), buffer_free(&crs->bfr); - free(crs), crs = NULL; - } -} diff --git a/src/coarse.c b/src/coarse.c deleted file mode 100644 index 08ee3155..00000000 --- a/src/coarse.c +++ /dev/null @@ -1,394 +0,0 @@ -#include "coarse-impl.h" -#include "metrics.h" -#include "sort.h" - -//------------------------------------------------------------------------------ -// Better API for coarse grid system. 
-// -uint unique_ids(sint *perm, ulong *uid, uint n, const ulong *ids, buffer *bfr) { - struct id_t { - ulong id; - uint idx; - sint perm; - }; - - struct array arr; - array_init(struct id_t, &arr, n); - - uint i; - struct id_t t = {.id = 0, .idx = 0, .perm = -1}; - for (i = 0; i < n; i++) { - t.id = ids[i], t.idx = i; - array_cat(struct id_t, &arr, &t, 1); - } - - sarray_sort(struct id_t, arr.ptr, arr.n, id, 1, bfr); - struct id_t *pa = (struct id_t *)arr.ptr; - - // Ignore the ids numbered zero - sint un = 0; - ulong last = 0; - for (uint i = 0; i < arr.n; i++) { - ulong v = pa[i].id; - if (v != last) - last = uid[un] = v, un++; - pa[i].perm = un - 1; - } - - sarray_sort(struct id_t, pa, n, idx, 0, bfr); - pa = (struct id_t *)arr.ptr; - for (i = 0; i < n; i++) - perm[i] = pa[i].perm; - - array_free(&arr); - return un; -} - -// Number rows, local first then interface. Returns global number of local -// elements. -struct rsb_t { - uint i, s; - slong vtx[8]; -}; - -static void number_dofs(slong *nid, struct coarse *crs, const slong *ids, - const ulong *uid) { - uint un = crs->un; - buffer *bfr = &crs->bfr; - struct comm *ci = &crs->c; - sint *u2c = crs->u2c; - - int nnz = (un > 0); - struct comm c; - comm_split(ci, nnz, ci->id, &c); - - uint i, j; - if (nnz) { - sint *dof = tcalloc(sint, un); - int level = 1; - while (c.np > 1) { - struct gs_data *gsh = gs_setup(ids, un, &c, 0, gs_pairwise, 0); - - int bin = (c.id >= (c.np + 1) / 2); - for (i = 0; i < un; i++) - dof[i] = bin; - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - if (bin == 1) { - for (i = 0; i < un; i++) - dof[i] = 0; - } - - gs(dof, gs_int, gs_add, 0, gsh, bfr); - - for (i = 0; i < un; i++) { - if (dof[i] > 0 && u2c[i] >= 0 && !nid[u2c[i]]) - nid[u2c[i]] = level; - } - - gs_free(gsh); - - struct comm t; - comm_split(&c, bin, c.id, &t); - comm_free(&c); - comm_dup(&c, &t); - comm_free(&t); - - level++; - } - free(dof); - } - - // Calculate unqiue local and interface nodes based on compress ids. 
- // Finding unique local ids is easy. To find unique interface ids, we - // will have to sort in parallel and then manually find the unique ids. - struct dof_t { - ulong id, nid; - uint p, p0, idx; - }; - - struct array arr; - array_init(struct dof_t, &arr, crs->cn); - - uint ln = 0; - struct dof_t t = {.id = 0, .nid = 0, .p = 0, .p0 = ci->id, .idx = 0}; - for (i = 0; i < crs->cn; i++) { - if (!nid[i]) - ln++; - else - t.id = uid[i], t.idx = i, array_cat(struct dof_t, &arr, &t, 1); - } - crs->n[0] = ln; - - slong cnt[1] = {ln}, out[2][1], wrk[2][1]; - comm_scan(out, ci, gs_long, gs_add, cnt, 1, wrk); - crs->s[0] = out[0][0] + 1, crs->ng[0] = out[1][0]; - - for (i = 0, ln = 0; i < crs->cn; i++) { - if (!nid[i]) - nid[i] = crs->s[0] + ln, ln++; - } - assert(crs->n[0] == ln); - - // parallel_sort and set nid and send back to p0 - parallel_sort(struct dof_t, &arr, id, gs_long, 0, 0, ci, bfr); - - uint in = 0; - if (arr.n > 0) { - struct dof_t *pa = (struct dof_t *)arr.ptr; - for (i = in = 1; i < arr.n; i++) - in += (pa[i].id != pa[i - 1].id); - } - - cnt[0] = in; - comm_scan(out, ci, gs_long, gs_add, cnt, 1, wrk); - crs->ng[1] = out[1][0]; - slong s = crs->ng[0] + out[0][0] + 1; - - if (in) { - struct dof_t *pa = (struct dof_t *)arr.ptr; - i = 0; - while (i < arr.n) { - for (j = i + 1; j < arr.n && pa[j].id == pa[i].id; j++) - ; - for (; i < j; i++) - pa[i].nid = s; - s++; - } - } - - struct crystal cr; - crystal_init(&cr, ci); - sarray_transfer(struct dof_t, &arr, p0, 0, &cr); - crystal_free(&cr); - - sarray_sort(struct dof_t, arr.ptr, arr.n, id, 1, bfr); - struct dof_t *pa = (struct dof_t *)arr.ptr; - for (i = 0; i < arr.n; i++) - nid[pa[i].idx] = pa[i].nid; - - array_free(&arr); - comm_free(&c); -} - -// n = ncr * nelt -// nz = ncr * ncr * nelt -struct coarse *crs_parrsb_setup(uint n, const ulong *id, uint nz, - const uint *Ai, const uint *Aj, const scalar *A, - unsigned null_space, unsigned type, - const struct comm *c) { - comm_barrier(c); - double tcrs = 
comm_time(); - - // crs->un is the user vector size. - struct coarse *crs = tcalloc(struct coarse, 1); - crs->null_space = null_space, crs->type = type, crs->un = n; - for (unsigned i = 0; i < 3; i++) - crs->ng[i] = crs->s[i] = crs->n[i] = 0; - - // Setup the buffer and duplicate the communicator. - buffer_init(&crs->bfr, 1024); - comm_dup(&crs->c, c); - - // Let's renumber the ids just in case its the schur solver. Schwarz solver - // doesn't need re-numbering but we are going to go ahead and do it. - slong *tid = tcalloc(slong, crs->un); - for (uint i = 0; i < n; i++) - tid[i] = id[i]; - - // Find the mapping from user ids to unique ids (compressed ids) local to the - // processor. Compressed vector size is `crs->cn`. - ulong *uid = tcalloc(ulong, crs->un); - crs->u2c = tcalloc(sint, crs->un); - crs->cn = unique_ids(crs->u2c, uid, crs->un, tid, &crs->bfr); -#if 0 - for (uint i = 0; i < crs->un; i++) { - printf("p = %d i = %u perm[i] = %d\n", c->id, i, crs->u2c[i]); - fflush(stdout); - } -#endif - - // Now renumber unique ids based on whether they are internal or on interface. - slong *nid = tcalloc(slong, crs->cn); - number_dofs(nid, crs, tid, uid); - free(tid), free(uid); - - // Now let's setup the coarse system. Create `struct mij` entries and pass - // them into schur setup. Which processor owns the dof? All the local dofs - // are owned by those specific preocessors -- interface dofs are owned in - // a load balanced manner. 
- uint nr = crs->ng[1] / c->np, nrem = crs->ng[1] - nr * c->np; - uint p0 = c->np - nrem; - ulong s0 = p0 * nr; - - struct array mijs; - array_init(struct mij, &mijs, n); - - struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0}; - for (uint k = 0; k < nz; k++) { - sint i = crs->u2c[Ai[k]], j = crs->u2c[Aj[k]]; - if (i < 0 || j < 0 || A[k] == 0) - continue; - m.r = nid[i], m.c = nid[j], m.v = A[k], m.p = c->id; - if (m.r > crs->ng[0]) { - if (m.r - crs->ng[0] <= s0) - m.p = (m.r - crs->ng[0] - 1) / nr; - else - m.p = p0 + (m.r - crs->ng[0] - s0 - 1) / (nr + 1); - } - array_cat(struct mij, &mijs, &m, 1); - } - - // Now let's assemble the matrix by sending load balancing the interface rows. - // Assembled size is `an`. - struct crystal cr; - crystal_init(&cr, c); - sarray_transfer(struct mij, &mijs, p, 1, &cr); - - nid = trealloc(slong, nid, crs->cn + crs->n[0] + nr + 1); - for (uint i = 0; i < crs->cn; i++) - nid[i] = -nid[i]; - - crs->an = 0; - if (mijs.n > 0) { - sarray_sort_2(struct mij, mijs.ptr, mijs.n, r, 1, c, 1, &crs->bfr); - struct mij *pm = (struct mij *)mijs.ptr; - uint i = 0, j; - while (i < mijs.n) { - for (j = i + 1; j < mijs.n && pm[j].r == pm[i].r; j++) - ; - nid[crs->cn + crs->an] = pm[i].r, crs->an++, i = j; - } - } - crs->n[1] = crs->an - crs->n[0]; - crs->s[1] = nid[crs->cn + crs->n[0]]; - crs->c2a = gs_setup(nid, crs->cn + crs->an, c, 0, gs_pairwise, 0); - - tcrs = comm_time() - tcrs; - double wrk, min = tcrs, max = tcrs; - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - if (c->id == 0) { - printf("parrsb_crs_setup: %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - tcrs = comm_time(); - - switch (type) { - case 0: - schur_setup(crs, &mijs, &cr, &crs->bfr); - break; - default: - break; - } - - min = max = comm_time() - tcrs; - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - if (c->id == 0) { - 
printf("schur_setup: %g %g (min max)\n", min, max); - fflush(stdout); - } - - array_free(&mijs), crystal_free(&cr); - - return crs; -} - -void crs_parrsb_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol) { - metric_init(); - - scalar *rhs = tcalloc(scalar, crs->cn + crs->an); - for (uint i = 0; i < crs->un; i++) { - if (crs->u2c[i] >= 0) - rhs[crs->u2c[i]] += b[i]; - } - -#if 0 - for (uint i = 0; i < crs->cn; i++) { - printf("p = %d i = %u before b[i] = %lf\n", crs->c.id, i, rhs[i]); - fflush(stdout); - } -#endif - - gs(rhs, gs_double, gs_add, 1, crs->c2a, &crs->bfr); - -#if 0 - char name[BUFSIZ]; - snprintf(name, BUFSIZ, "rsb_b_np_%d_id_%d_nl_%lld_ni_%lld.txt", crs->c.np, - crs->c.id, crs->n[0], crs->n[1]); - FILE *fp = fopen(name, "w"); - if (fp) { - for (uint i = 0; i < crs->an; i++) - fprintf(fp, "%lf\n", rhs[crs->cn + i]); - fclose(fp); - } -#endif - -#if 0 - for (uint i = 0; i < crs->an; i++) { - printf("p = %d i = %u after b[i] = %lf\n", crs->c.id, i, rhs[crs->cn + i]); - fflush(stdout); - } -#endif - - switch (crs->type) { - case 0: - schur_solve(rhs + crs->cn, crs, rhs + crs->cn, tol, &crs->bfr); - break; - default: - break; - } - -#if 0 - for (uint i = 0; i < crs->an; i++) { - printf("p = %d i = %u x[i] = %lf w[i] = %lf\n", crs->c.id, i, - rhs[crs->cn + i], weights[crs->cn + i]); - fflush(stdout); - } -#endif - - gs(rhs, gs_double, gs_add, 0, crs->c2a, &crs->bfr); - for (uint i = 0; i < crs->un; i++) { - if (crs->u2c[i] >= 0) - x[i] = rhs[crs->u2c[i]]; - else - x[i] = 0; - } - free(rhs); - -#if 0 - snprintf(name, BUFSIZ, "rsb_x_np_%d_id_%d_un_%u.txt", crs->c.np, crs->c.id, - crs->un); - fp = fopen(name, "w"); - if (fp) { - for (uint i = 0; i < crs->un; i++) - fprintf(fp, "%lf\n", x[i]); - fclose(fp); - } -#endif - - metric_push_level(); - metric_crs_print(&crs->c, 1); - metric_finalize(); -} - -void crs_parrsb_free(struct coarse *crs) { - if (crs != NULL) { - switch (crs->type) { - case 0: - schur_free(crs); - break; - default: - break; - } - if 
(crs->u2c) - free(crs->u2c); - gs_free(crs->c2a); - comm_free(&crs->c), buffer_free(&crs->bfr); - free(crs), crs = NULL; - } -} diff --git a/src/coarse.h b/src/coarse.h deleted file mode 100644 index 49698985..00000000 --- a/src/coarse.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _PARRSB_COARSE_H_ -#define _PARRSB_COARSE_H_ - -#include "gslib.h" -#include "mat.h" - -struct coarse; - -// API for the Laplacian (which involves solving for the dual graph) -struct coarse *coarse_setup(unsigned nelt, unsigned nv, const long long *vtx, - const scalar *coord, unsigned null_space, - unsigned type, struct comm *c); -void coarse_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol); -void coarse_free(struct coarse *crs); - -// Alternative API for a general matrix -#define crs_parrsb_setup PREFIXED_NAME(crs_parrsb_setup) -#define crs_parrsb_solve PREFIXED_NAME(crs_parrsb_solve) -#define crs_parrsb_free PREFIXED_NAME(crs_parrsb_free) - -struct coarse *crs_parrsb_setup(uint n, const ulong *id, uint nz, - const uint *Ai, const uint *Aj, const scalar *A, - unsigned null_space, unsigned type, - const struct comm *comm); -void crs_parrsb_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol); -void crs_parrsb_free(struct coarse *crs); - -#endif diff --git a/src/components.c b/src/components.c index 28eda2f8..3efb84fb 100644 --- a/src/components.c +++ b/src/components.c @@ -36,7 +36,8 @@ uint get_components(sint *component, struct array *elems, unsigned nv, struct comm cc; uint count = 0; - slong nnz1, nnzg, nnzg0, nnzb, nmarked = 0; + slong nnz1, nnzg, nnzg0, nnzb; + ulong nmarked = 0; do { // Count unmarked elements arr.n = 0; @@ -120,13 +121,13 @@ struct cmp_t { }; static sint find_or_insert(struct array *cids, struct cmp_t *t) { - // If there are no elements in the array, insert and exit + // If there are no elements in the array, insert and exit. 
if (cids->n == 0) { array_cat(struct cmp_t, cids, t, 1); return -1; } - // Otherwise, we will do a binary search + // Otherwise, we will do a binary search. struct cmp_t *pc = (struct cmp_t *)cids->ptr; sint s = 0, e = cids->n - 1, mid = 0; while (s <= e) { @@ -139,7 +140,7 @@ static sint find_or_insert(struct array *cids, struct cmp_t *t) { s = mid + 1; } - // Okay, not found -- insert at `mid` or `mid + 1` + // Okay, not found -- insert at `mid` or `mid + 1`. uint max = cids->max; if (max == cids->n) { max += max / 2 + 1; @@ -158,7 +159,7 @@ static sint find_or_insert(struct array *cids, struct cmp_t *t) { } pc[n] = t0, cids->n++; - // Sanity check + // Sanity check. for (unsigned i = 1; i < cids->n; i++) assert(pc[i - 1].c < pc[i].c); @@ -177,8 +178,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, if (nelg == 0) return 0; - uint nev = nelt * nv; - sint *p0 = tcalloc(sint, 2 * nev), *p = p0 + nev; + const uint nev = nelt * nv; + sint *p0 = tcalloc(sint, nev); + sint *p = tcalloc(sint, nev); slong *ids = tcalloc(slong, nev); uint *inds = tcalloc(uint, nev); @@ -190,9 +192,10 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, component[e] = -1; struct comm c; - slong nmkd = 0, nc = 0; + ulong nmkd = 0; + slong nc = 0; do { - // Copy unmarked elements to ids + // Copy unmarked elements to ids. uint unmkd = 0; for (uint e = 0; e < nelt; e++) { if (component[e] == -1) { @@ -206,28 +209,30 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, int bin = (unmkd > 0); comm_split(ci, bin, ci->id, &c); - slong nnzg = 0, nnzg0 = 0, ncg = 0; + slong nnzg = 0, ncg = 0; if (bin == 1) { - // Setup gs - struct gs_data *gsh = gs_setup(ids, unmkd * nv, &c, 0, gs_pairwise, 0); - - // Mark the first unmarked element as seed for the component c.id + // Mark the first unmarked element as seed for the component c.id. 
for (uint v = 0; v < nv; v++) p[0 * nv + v] = c.id; - // Initialize the rest of p + // Initialize the rest of p. for (uint e = 1; e < unmkd; e++) for (uint v = 0; v < nv; v++) p[e * nv + v] = -1; - sint nnz, changed; + // Setup gather-scatter to do BFS. + struct gs_data *gsh = gs_setup(ids, unmkd * nv, &c, 0, gs_pairwise, 0); + + // Perform BFS. + sint changed; do { for (uint i = 0; i < unmkd * nv; i++) p0[i] = p[i]; gs(p, gs_int, gs_max, 0, gsh, bfr); - nnz = changed = 0; + changed = 0; + sint nnz = 0; for (uint e = 0; e < unmkd; e++) { sint v0 = -1; for (uint v = 0; v < nv; v++) { @@ -239,7 +244,8 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, } } - // There was one non-zero vertex in the element + // If there was at least one non-zero vertex in the element, we mark + // the element with that value. if (v0 > -1) { sint c = p[e * nv + v0]; for (uint v = 0; v < nv; v++) @@ -247,6 +253,7 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, nnz++; } + // Check if the component id changed. for (uint v = 0; v < nv; v++) { if (p[e * nv + v] != p0[e * nv + v]) { changed = 1; @@ -255,14 +262,15 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, } } - nnzg0 = nnzg, nnzg = nnz; + nnzg = nnz; comm_allreduce(&c, gs_long, gs_add, &nnzg, 1, wrk); comm_allreduce(&c, gs_int, gs_add, &changed, 1, wrk); } while (changed); + gs_free(gsh); // Find unique local components and then use them to find unique - // global components + // global components. struct array cids; array_init(struct cmp_t, &cids, 100); @@ -276,10 +284,13 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, struct crystal cr; crystal_init(&cr, &c); - sarray_transfer(struct cmp_t, &cids, p, 1, &cr); - // find unique components and number them + // Send the component id `C` to `C % P` where `P` is the number of + // processors. 
+ sarray_transfer(struct cmp_t, &cids, p, 1, &cr); sarray_sort(struct cmp_t, cids.ptr, cids.n, c, 0, bfr); + + // Find unique components and number them globally. uint cnt = 0; if (cids.n > 0) { cnt++; @@ -307,8 +318,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, sarray_transfer(struct cmp_t, &cids, p, 0, &cr); crystal_free(&cr); - sarray_sort(struct cmp_t, cids.ptr, cids.n, c, 0, bfr); + + // Now assign the global component id to the marked elements. for (uint e = 0; e < unmkd; e++) { if (p[e * nv + 0] > -1) { t.c = p[e * nv + 0]; @@ -328,9 +340,9 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, nc += ncg; } while (nmkd < nelg); - free(p0), free(ids), free(inds); if (null_input == 1) free(component); + free(p0), free(p), free(ids), free(inds); return nc; } diff --git a/src/con-check.c b/src/con-check.c index f0e57937..dfe9e86a 100644 --- a/src/con-check.c +++ b/src/con-check.c @@ -24,8 +24,8 @@ typedef struct { } ProcID; static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) { - sint nelt = m->nelt; - sint nv = m->nv; + uint nelt = m->nelt; + uint nv = m->nv; slong out[2][1], buf[2][1], in = nelt; comm_scan(out, c, gs_long, gs_add, &in, 1, buf); @@ -38,7 +38,7 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) { // Create (globalId, elementId) pairs and send them to globalId % np Point ptr = m->elements.ptr; - sint i, j; + uint i, j; for (i = 0; i < nelt; i++) { for (j = 0; j < nv; j++) { ulong globalId = ptr[i * nv + j].globalId + 1; @@ -90,7 +90,7 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) { array_init(ProcID, &procs, 10); vPtr = vtcsCmpct.ptr; - sint s = 0, e; + uint s = 0, e; vertex t; ProcID p; while (s < vtcsCmpct.n) { @@ -168,12 +168,12 @@ static VToEMap *getVToEMap(Mesh m, struct comm *c, buffer *bfr) { } // key must be present in globalIds -static int getPosition(VToEMap *map, ulong key) { +static uint getPosition(VToEMap *map, ulong key) { ulong 
*globalIds = map->globalIds; - int begin = 0; - int end = map->size; - int mid = 0; + uint begin = 0; + uint end = map->size; + uint mid = 0; while (begin < end) { mid = (begin + end) / 2; @@ -186,7 +186,7 @@ static int getPosition(VToEMap *map, ulong key) { }; if (globalIds[mid] != key) - return -1; + return UINT_MAX; return mid; } @@ -197,11 +197,11 @@ static void freeVToEMap(VToEMap *map) { free(map); } -int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) { +int face_check(Mesh mesh, struct comm *c, buffer *bfr) { VToEMap *map = getVToEMap(mesh, c, bfr); - sint nelt = mesh->nelt; - sint ndim = mesh->ndim; + uint nelt = mesh->nelt; + uint ndim = mesh->ndim; int faces[GC_MAX_FACES][GC_MAX_FACE_VERTICES]; if (ndim == 3) @@ -210,24 +210,24 @@ int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) { memcpy(faces, faces2D, GC_MAX_FACES * GC_MAX_FACE_VERTICES * sizeof(int)); Point ptr = mesh->elements.ptr; - int nf = (ndim == 3) ? 6 : 4; - int nfv = (ndim == 3) ? 4 : 2; - int nv = (ndim == 3) ? 8 : 4; + uint nf = (ndim == 3) ? 6 : 4; + uint nfv = (ndim == 3) ? 4 : 2; + uint nv = (ndim == 3) ? 8 : 4; struct array shared; array_init(LongID, &shared, 200); int err = 0; - int i, j, k, l; + uint i, j, k, l; for (i = 0; i < nelt && err == 0; i++) { for (j = 0; j < nf && err == 0; j++) { shared.n = 0; for (k = 0; k < nfv; k++) { ulong globalId = ptr[i * nv + faces[j][k] - 1].globalId + 1; - int indx = getPosition(map, globalId); - assert(indx >= 0); + uint indx = getPosition(map, globalId); + assert(indx < UINT_MAX); LongID elemId; for (l = map->offsets[indx]; l < map->offsets[indx + 1]; l++) { elemId.id = map->elements[l]; @@ -265,10 +265,10 @@ int faceCheck(Mesh mesh, struct comm *c, buffer *bfr) { return err; } -int elementCheck(Mesh mesh, struct comm *c, buffer *bfr) { +int element_check(Mesh mesh, struct comm *c, buffer *bfr) { uint nelt = mesh->nelt; uint ndim = mesh->ndim; - int nv = (ndim == 3) ? 8 : 4; + uint nv = (ndim == 3) ? 
8 : 4; LongID globalIds[8]; Point ptr = mesh->elements.ptr; diff --git a/src/con-impl.h b/src/con-impl.h index 252acdd8..07bd8aa0 100644 --- a/src/con-impl.h +++ b/src/con-impl.h @@ -3,7 +3,6 @@ #include "parrsb-impl.h" #include "sort.h" -#include /* Preprocessor Corner notation: Symmetric Corner notation: @@ -126,10 +125,10 @@ int send_back(Mesh mesh, struct comm *c, buffer *bfr); int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose, buffer *bfr); -int matchPeriodicFaces(Mesh mesh, struct comm *c, buffer *bfr); +int match_periodic_faces(Mesh mesh, struct comm *c, int verbose, buffer *bfr); -int elementCheck(Mesh mesh, struct comm *c, buffer *bfr); +int element_check(Mesh mesh, struct comm *c, buffer *bfr); -int faceCheck(Mesh mesh, struct comm *c, buffer *bfr); +int face_check(Mesh mesh, struct comm *c, buffer *bfr); #endif // _CON_IMPL_H_ diff --git a/src/con-periodic.c b/src/con-periodic.c index e915d539..b0c61dc8 100644 --- a/src/con-periodic.c +++ b/src/con-periodic.c @@ -1,5 +1,7 @@ #include "con-impl.h" +#include + //============================================================================== // Handle periodic BCs // @@ -26,9 +28,9 @@ static int compressPeriodicVertices(Mesh mesh, struct comm *c, buffer *bfr) { Point points = mesh->elements.ptr; uint npoints = mesh->elements.n; - sint i, nunique = 0; + uint i, nunique = 0; if (npoints > 0) { - slong current = points[0].globalId; + ulong current = points[0].globalId; points[0].globalId = nunique; for (i = 1; i < npoints; i++) if (points[i].globalId == current) @@ -187,7 +189,8 @@ static int findConnectedPeriodicFaces(Mesh mesh, struct array *matched) { for (i = 0; i < bSize - 1; i++) { for (j = i + 1; j < bSize; j++) - if (ptr[j].bc[0] == ptr[i].elementId && ptr[j].bc[1] == ptr[i].faceId) { + if ((ulong)ptr[j].bc[0] == ptr[i].elementId && + (ulong)ptr[j].bc[1] == ptr[i].faceId) { findConnectedPeriodicPairs(mesh, &ptr[i], &ptr[j], matched); } } @@ -195,7 +198,7 @@ static int 
findConnectedPeriodicFaces(Mesh mesh, struct array *matched) { } static int gatherMatchingPeriodicFaces(Mesh mesh, struct comm *c) { - int size = c->np, rank = c->id; + uint size = c->np; BoundaryFace bPtr = mesh->boundary.ptr; int nFaces = mesh->boundary.n; @@ -208,7 +211,7 @@ static int gatherMatchingPeriodicFaces(Mesh mesh, struct comm *c) { sint i; slong eid; for (i = 0; i < nFaces; i++) { - eid = MAX(bPtr[i].bc[0], bPtr[i].elementId); + eid = MAX((ulong)bPtr[i].bc[0], bPtr[i].elementId); if (eid < N) bPtr[i].proc = eid / nelt; else @@ -263,19 +266,33 @@ static int setPeriodicFaceCoordinates(Mesh mesh, struct comm *c, buffer *buf) { return 0; } -int matchPeriodicFaces(Mesh mesh, struct comm *c, buffer *bfr) { +int match_periodic_faces(Mesh mesh, struct comm *c, int verbose, buffer *bfr) { + const char *functions[6] = { + "set_periodic_face_coords ", "gather_matching_periodic_faces", + "find_connected_periodic_faces ", "renumber_periodic_vertices ", + "compress_periodic_vertices ", "send_back "}; + + parrsb_print(c, verbose, "\t\t%s ...", functions[0]); setPeriodicFaceCoordinates(mesh, c, bfr); + + parrsb_print(c, verbose, "\t\t%s ...", functions[1]); gatherMatchingPeriodicFaces(mesh, c); struct array matched; array_init(struct mpair_t, &matched, 10); matched.n = 0; + parrsb_print(c, verbose, "\t\t%s ...", functions[2]); findConnectedPeriodicFaces(mesh, &matched); + + parrsb_print(c, verbose, "\t\t%s ...", functions[3]); renumberPeriodicVertices(mesh, c, &matched, bfr); array_free(&matched); + parrsb_print(c, verbose, "\t\t%s ...", functions[4]); compressPeriodicVertices(mesh, c, bfr); + + parrsb_print(c, verbose, "\t\t%s ...", functions[5]); send_back(mesh, c, bfr); return 0; diff --git a/src/con-unique-vertices.c b/src/con-unique-vertices.c index f610dc8b..b4564f8a 100644 --- a/src/con-unique-vertices.c +++ b/src/con-unique-vertices.c @@ -63,10 +63,12 @@ static void tuple_sort_(void *ra, uint n, uint usize, uint offset) { tuple_sort_((void *)arr, n, sizeof(T), 
offsetof(T, index)) static void sort_segments_local(struct array *local, int dim) { - sint npts = local->n; - struct point_t *pts = (struct point_t *)local->ptr; + uint npts = local->n; + if (npts == 0) + return; - sint s = 0, e; + struct point_t *const pts = (struct point_t *const)local->ptr; + uint s = 0, e; while (s < npts) { for (e = s + 1; e < npts && pts[e].ifSegment == 0; e++) ; @@ -99,7 +101,8 @@ static void sort_segments_local(struct array *local, int dim) { } static void sort_segments_shared_aux(struct array *arr, int dim, struct comm *c, - buffer *bfr) { + int verbose, buffer *bfr) { + parrsb_print(c, verbose, "\t\t\t\tsss_aux_parallel_sort: ...\n"); switch (dim) { case 0: parallel_sort(struct point_t, arr, x[0], gs_double, 0, 1, c, bfr); @@ -113,23 +116,85 @@ static void sort_segments_shared_aux(struct array *arr, int dim, struct comm *c, default: break; } + parrsb_print(c, verbose, "\t\t\t\tsss_aux_parallel_sort: done.\n"); // Mark the first point of the segment to have ifSegment = 1 and zero out // everything else. - struct point_t *pts = (struct point_t *)arr->ptr; + struct point_t *const pts = (struct point_t *const)arr->ptr; for (uint i = 0; i < arr->n; i++) pts[i].ifSegment = 0; + sint wrk; sint rank = (arr->n > 0) ? 
c->id : c->np; - sint wrk[2]; - comm_allreduce(c, gs_int, gs_min, &rank, 1, wrk); + comm_allreduce(c, gs_int, gs_min, &rank, 1, &wrk); - if (c->id == rank) + if ((sint)c->id == rank) pts[0].ifSegment = 1; + + parrsb_print(c, verbose, "\t\t\t\tsss_aux_mark_first_point: done."); +} + +static uint find_bin_scan(const sint sum, const struct comm *c, + const int verbose, buffer *bfr) { + sint out[2][1], wrk[2][1], in = sum; + comm_scan(out, c, gs_int, gs_add, &in, 1, wrk); + return out[0][0]; +} + +static uint find_bin_gs(const slong id, const struct comm *c, const int verbose, + buffer *bfr) { + slong gid = id + 1; + struct gs_data *gsh = gs_setup(&gid, 1, c, 0, gs_crystal_router, verbose); + parrsb_print(c, verbose, "\t\t\tsss_gs_setup: done."); + sint bin = c->id; + gs(&bin, gs_int, gs_min, 0, gsh, bfr); + gs_free(gsh); + + return bin; +} + +static uint find_bin_cr(const slong id, const struct comm *c, const int verbose, + buffer *bfr) { + struct gid_t { + ulong id; + uint proc, procm; + }; + + struct array arr; + array_init(struct gid_t, &arr, 1); + + struct gid_t gid = {.id = id, .proc = id % c->np, .procm = c->id}; + array_cat(struct gid_t, &arr, &gid, 1); + + struct crystal cr; + crystal_init(&cr, c); + + sarray_transfer(struct gid_t, &arr, proc, 1, &cr); + if (arr.n > 0) { + sarray_sort_2(struct gid_t, arr.ptr, arr.n, id, 1, procm, 0, bfr); + struct gid_t *pa = (struct gid_t *)arr.ptr; + uint s = 0; + while (s < arr.n) { + uint e = s + 1; + for (; e < arr.n && pa[s].id == pa[e].id; e++) + pa[e].procm = pa[s].procm; + s = e; + } + } + sarray_transfer(struct gid_t, &arr, proc, 0, &cr); + + crystal_free(&cr); + + assert(arr.n == 1); + struct gid_t *pa = (struct gid_t *)arr.ptr; + uint procm = pa[0].procm; + array_free(&arr); + + return procm; } static void sort_segments_shared(struct array *shared, int dim, struct comm *c, - buffer *bfr) { + int verbose, buffer *bfr) { // Each process can only have at most a single ifSegment = 1 in shared // array. 
Otherwise, we can always move the segments into the local segments // array till we end up in such a configuration. Let's first check for this @@ -157,13 +222,24 @@ static void sort_segments_shared(struct array *shared, int dim, struct comm *c, } } assert(sum <= 1); - assert(ngids <= 1 || (ngids == 2 && gids[0] + 1 == gids[1])); + assert(ngids <= 1 || (ngids == 2 && gids[1] == gids[0] + 1)); + parrsb_print(c, verbose, "\t\t\tsss_local: done."); + + // Algorithm to be used for finding the bin id for segmented shared sort. + // Default (algo = 0) is the scan. algo = 1 is gs with gs_crystal_router. + // algo = 2 is a custom crystal router implementation. + int algo = 0; + char *val = getenv("PARRSB_FIND_BIN_ALGO"); + if (val) + algo = atoi(val); + assert(algo >= 0 && algo <= 2); // We sort the shared segments in two phases. All the segments having an even // global id are sorted first and then the segments having an odd global id // are sorted. This is done to avoid same process having to work on both the // global ids (if ngids = 2) it owns at the same time. for (int parity = 0; parity < 2; parity++) { + parrsb_print(c, verbose, "\t\t\tsss_parity_%d: ...", parity); int index = INT_MIN; if (gids[0] >= 0 && (gids[0] % 2 == parity)) index = 0; @@ -173,21 +249,30 @@ static void sort_segments_shared(struct array *shared, int dim, struct comm *c, struct comm active, seg; comm_split(c, index >= 0, c->id, &active); if (index >= 0) { - // Setup a gs handle to find the minimum rank with the current global id - // and use that rank as the bin for the comm_split. 
- slong id = gids[index] + 1; - struct gs_data *gsh = gs_setup(&id, 1, &active, 0, gs_pairwise, 0); - sint bin = active.id; - gs(&bin, gs_int, gs_min, 0, gsh, bfr); - gs_free(gsh); + assert(gids[index] >= 0); + sint bin = -1; + if (algo == 0) { + uint off = (ngids == 1 && sum == 1) || (ngids == 2 && index == 1); + bin = find_bin_scan(sum, &active, verbose - 1, bfr) + off; + } else if (algo == 1) { + bin = find_bin_gs(gids[index], &active, verbose - 1, bfr); + } else if (algo == 2) { + bin = find_bin_cr(gids[index], &active, verbose - 1, bfr); + } + parrsb_print(&active, verbose, + "\t\t\tsss_find_bin_algo_%d_parity_%d: done.", algo, parity); + assert(bin >= 0 && bin <= (sint)active.np); // index >= 0 --> gids[index] >= 0 --> segments[index].n > 0 comm_split(&active, bin, active.id, &seg); - sort_segments_shared_aux(&segments[index], dim, &seg, bfr); + sort_segments_shared_aux(&segments[index], dim, &seg, verbose - 1, bfr); comm_free(&seg); + parrsb_print(&active, verbose, "\t\t\tsss_aux_%d: done.", parity); } comm_free(&active); + parrsb_print(c, verbose, "\t\t\tsss_parity_%d: done.", parity); } + parrsb_print(c, verbose, "\t\t\tsss_shared: done."); // Combine the segments after sorting. shared->n = 0; @@ -216,7 +301,7 @@ static int talk_to_neighbor(struct point_t *pnt, const struct array *arr, struct point_t *pts = (struct point_t *)arr->ptr; sint dest = (sint)c->id + dir; - if (dest >= 0 && dest < c->np) { + if (dest >= 0 && dest < (sint)c->np) { struct point_t p = (dir == 1) ? pts[arr->n - 1] : pts[0]; p.proc = dest; array_cat(struct point_t, &tmp, &p, 1); @@ -294,8 +379,8 @@ static void separate_local_segments(struct array *local, struct array *shared, s = e; } - sint check = lcheck, wrk[2]; - comm_allreduce(c, gs_int, gs_add, &check, 1, wrk); + sint check = lcheck, wrk; + comm_allreduce(c, gs_int, gs_add, &check, 1, &wrk); if (check) { // Bring the first point from next process. Check if `ifSegment` value // of that point is a 1 or a 0. 
If it is a 1, add the current range to @@ -357,9 +442,9 @@ static slong number_segments(struct array *local, struct array *shared, return st + lt; } -static int number_points(struct array *elems, const struct array *local, - const struct array *shared, const struct comm *c, - buffer *bfr) { +static void number_points(struct array *elems, const struct array *local, + const struct array *shared, const struct comm *c, + buffer *bfr) { // First number local points and then number shared points. slong out[2][1], wrk[2][1], in = local->n; comm_scan(out, c, gs_long, gs_add, &in, 1, wrk); @@ -401,8 +486,8 @@ int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose, for (uint i = 0; i < elems->n; i++) pts[i].ifSegment = pts[i].globalId = 0; - slong npts = elems->n, wrk[2]; - comm_allreduce(c, gs_long, gs_add, &npts, 1, wrk); + slong npts = elems->n, wrk; + comm_allreduce(c, gs_long, gs_add, &npts, 1, &wrk); // Initialize shared and local arrays and then copy all points in `elems` // array to shared array first. Shared array contains only the segments which @@ -417,29 +502,33 @@ int find_unique_vertices(Mesh mesh, struct comm *c, scalar tol, int verbose, for (int t = 0; t < ndim; t++) { for (int d = 0; d < ndim; d++) { - debug_print(c, verbose, "\t\tlocglob: %d %d", t + 1, d + 1); - // Sort both local and shared segments. - sort_segments_shared(&shared, d, c, bfr); + parrsb_print(c, verbose - 1, "\t\tsort_shared_segments ..."); + sort_segments_shared(&shared, d, c, verbose - 1, bfr); + parrsb_print(c, verbose - 1, "\t\tsort_local_segments ..."); sort_segments_local(&local, d); // Find segments in local and shared segments now. + parrsb_print(c, verbose - 1, "\t\tfind_shared_segments ..."); find_segments(&shared, d, tol2, c); + parrsb_print(c, verbose - 1, "\t\tfind_local_segments ..."); find_segments(&local, d, tol2, &COMM_NULL); // Separate local segments from the shared segments. 
+ parrsb_print(c, verbose - 1, "\t\tseparate_local_segments ..."); separate_local_segments(&local, &shared, c); // Number the segments. + parrsb_print(c, verbose - 1, "\t\tnumber_segments ..."); slong nseg = number_segments(&local, &shared, c); - debug_print(c, verbose, " %lld %lld\n", nseg, npts); + parrsb_print(c, verbose, "\tlocglob: %d %d %lld %lld", t + 1, d + 1, nseg, + npts); } } // Number points consecutively -- shared points after local and then load // balance. - debug_print(c, verbose, "\t\tnumber points and load balance ..."); + parrsb_print(c, verbose - 1, "\tnumber_points_and_load_balance ..."); number_points(elems, &local, &shared, c, bfr); - debug_print(c, verbose, "done.\n"); array_free(&shared), array_free(&local); return 0; diff --git a/src/con.c b/src/con.c index 60e502c9..c34a918b 100644 --- a/src/con.c +++ b/src/con.c @@ -1,7 +1,4 @@ #include "con-impl.h" -#include "parrsb-impl.h" -#include "sort.h" -#include int PRE_TO_SYM_VERTEX[GC_MAX_VERTICES] = {0, 1, 3, 2, 4, 5, 7, 6}; int PRE_TO_SYM_FACE[GC_MAX_FACES] = {2, 1, 3, 0, 4, 5}; @@ -9,24 +6,13 @@ int NEIGHBOR_MAP[GC_MAX_VERTICES][GC_MAX_NEIGHBORS] = { {1, 2, 4}, {0, 3, 5}, {0, 3, 6}, {1, 2, 7}, {0, 5, 6}, {1, 4, 7}, {2, 4, 7}, {3, 5, 6}}; -void debug_print(struct comm *c, int verbose, const char *fmt, ...) 
{ - comm_barrier(c); - va_list vargs; - va_start(vargs, fmt); - if (c->id == 0 && verbose > 0) { - vprintf(fmt, vargs); - fflush(stdout); - } - va_end(vargs); -} - double diff_sqr(double x, double y) { return (x - y) * (x - y); } //============================================================================== // Mesh struct // -static struct mesh_t *mesh_init(int nelt, int ndim, double *coord, - long long *pinfo, int npinfo, +static struct mesh_t *mesh_init(uint nelt, unsigned ndim, double *coord, + long long *pinfo, uint npinfo, const struct comm *c) { struct mesh_t *m = tcalloc(struct mesh_t, 1); m->nelt = nelt, m->ndim = ndim, m->nnbrs = ndim; @@ -37,7 +23,7 @@ static struct mesh_t *mesh_init(int nelt, int ndim, double *coord, ulong start = out[0][0]; m->nelgt = out[1][0]; - int nv = m->nv; + uint nv = m->nv; array_init(struct point_t, &m->elements, nelt * nv); struct point_t p = {.origin = c->id}; for (uint i = 0; i < nelt; i++) { @@ -79,39 +65,39 @@ static inline double distance_3d(struct point_t *a, struct point_t *b) { return distance_2d(a, b) + diff_sqr(a->x[2], b->x[2]); } -int findMinNeighborDistance(Mesh mesh) { +int find_min_neighbor_distance(Mesh mesh) { struct point_t *p = (struct point_t *)mesh->elements.ptr; - int ndim = mesh->ndim; - int nv = mesh->nv; + uint ndim = mesh->ndim; + uint nv = mesh->nv; - uint i, j, k; - int neighbor; - scalar d; + if (ndim < 2 || ndim > 3) + return 1; + uint i, j, k, neighbor; if (ndim == 3) { for (i = 0; i < mesh->elements.n; i += nv) { for (j = 0; j < nv; j++) { p[i + j].dx = SCALAR_MAX; for (k = 0; k < mesh->nnbrs; k++) { neighbor = NEIGHBOR_MAP[j][k]; - d = distance_3d(&p[i + j], &p[i + neighbor]); + scalar d = distance_3d(&p[i + j], &p[i + neighbor]); p[i + j].dx = MIN(p[i + j].dx, d); } } } - } else if (ndim == 2) { + } + + if (ndim == 2) { for (i = 0; i < mesh->elements.n; i += nv) { for (j = 0; j < nv; j++) { p[i + j].dx = SCALAR_MAX; for (k = 0; k < mesh->nnbrs; k++) { neighbor = NEIGHBOR_MAP[j][k]; - d = 
distance_2d(&p[i + j], &p[i + neighbor]); + scalar d = distance_2d(&p[i + j], &p[i + neighbor]); p[i + j].dx = MIN(p[i + j].dx, d); } } } - } else { - return 1; } return 0; @@ -120,7 +106,7 @@ int findMinNeighborDistance(Mesh mesh) { //============================================================================== // Global numbering // -static int setGlobalID(Mesh mesh, struct comm *c) { +static int set_global_id(Mesh mesh, struct comm *c) { uint nPoints = mesh->elements.n; Point points = (struct point_t *)mesh->elements.ptr; @@ -128,9 +114,6 @@ static int setGlobalID(Mesh mesh, struct comm *c) { struct comm nonZeroRanks; comm_split(c, bin, c->id, &nonZeroRanks); - sint rank = nonZeroRanks.id; - sint size = nonZeroRanks.np; - if (bin == 1) { slong count = 0; for (uint i = 0; i < nPoints; i++) @@ -167,7 +150,7 @@ int send_back(Mesh mesh, struct comm *c, buffer *bfr) { return 0; } -static int transferBoundaryFaces(Mesh mesh, struct comm *c) { +static int transfer_boundary_faces(Mesh mesh, struct comm *c) { uint size = c->np; struct array *boundary = &mesh->boundary; @@ -200,24 +183,13 @@ static int transferBoundaryFaces(Mesh mesh, struct comm *c) { //============================================================================== // C interface to find_conn // -#define check_error(call, msg) \ - { \ - sint err = (call); \ - sint buf; \ - comm_allreduce(&c, gs_int, gs_max, &err, 1, &buf); \ - if (err) { \ - buffer_free(&bfr), mesh_free(mesh), comm_free(&c); \ - return err; \ - } \ - } - // Input: // nelt: Number of elements, nv: Number of vertices in an element // coord [nelt, nv, ndim]: Coordinates of elements vertices in preprocessor // ordering, nv = 8 if ndim == 3 (Hex) or nv = 4 if ndim = 2 (Quad). 
// Output: // vtx[nelt, nv]: Global numbering of vertices of elements -int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim, +int parrsb_conn_mesh(long long *vtx, double *coord, uint nelt, unsigned ndim, long long *pinfo, int npinfo, double tol, MPI_Comm comm) { struct comm c; comm_init(&c, comm); @@ -225,73 +197,77 @@ int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim, buffer bfr; buffer_init(&bfr, 1024); - int verbose = 0; + int verbose = 1; { const char *val = getenv("PARRSB_VERBOSE_LEVEL"); if (val != NULL) verbose = atoi(val); } - debug_print(&c, verbose, "Running parCon ...\n"); + parrsb_print(&c, verbose, "Running parCon ..."); parrsb_barrier(&c); double tall = comm_time(), t; double duration[8] = {0}; - const char *name[8] = {"transferBoundaryFaces", "findMinNbrDistance ", - "find_unique_vertices ", "setGlobalId ", - "elementCheck ", "faceCheck ", - "matchPeriodicFaces ", "copyOutput "}; + const char *name[8] = { + "transfer_boundary_faces ", "find_min_neighbor_distance ", + "find_unique_vertices ", "set_global_id ", + "element_check ", "face_check ", + "match_periodic_faces ", "copy_output "}; - // debug_print(&c, verbose, "\t%s ..."); - // parrsb_barrier(&c), t = comm_time(); Mesh mesh = mesh_init(nelt, ndim, coord, pinfo, npinfo, &c); - // duration[0] = comm_time() - t; - // debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[0]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[0]); parrsb_barrier(&c), t = comm_time(); - check_error(transferBoundaryFaces(mesh, &c), name[0]); + transfer_boundary_faces(mesh, &c); duration[0] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[1]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[1]); parrsb_barrier(&c), t = comm_time(); - check_error(findMinNeighborDistance(mesh), name[1]); + find_min_neighbor_distance(mesh); duration[1] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - 
debug_print(&c, verbose, "\t%s ...\n", name[2]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[2]); parrsb_barrier(&c), t = comm_time(); - check_error(find_unique_vertices(mesh, &c, tol, verbose, &bfr), name[2]); + find_unique_vertices(mesh, &c, tol, verbose - 1, &bfr); duration[2] = comm_time() - t; - debug_print(&c, verbose, "\t%s ...", name[3]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[3]); parrsb_barrier(&c), t = comm_time(); - setGlobalID(mesh, &c); + set_global_id(mesh, &c); send_back(mesh, &c, &bfr); duration[3] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[4]); +#define check_error(call, msg) \ + { \ + sint err = (call), wrk; \ + comm_allreduce(&c, gs_int, gs_max, &err, 1, &wrk); \ + if (err) { \ + parrsb_print(&c, 1, msg, __FILE__, __LINE__); \ + buffer_free(&bfr), mesh_free(mesh), comm_free(&c); \ + return err; \ + } \ + } + + parrsb_print(&c, verbose - 1, "\t%s ...", name[4]); parrsb_barrier(&c), t = comm_time(); - check_error(elementCheck(mesh, &c, &bfr), name[4]); + check_error(element_check(mesh, &c, &bfr), "\t%s:%d element_check failed."); duration[4] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[5]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[5]); parrsb_barrier(&c), t = comm_time(); - check_error(faceCheck(mesh, &c, &bfr), name[5]); + check_error(face_check(mesh, &c, &bfr), "\t%s:%d face_check failed."); duration[5] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[6]); +#undef check_error + + parrsb_print(&c, verbose - 1, "\t%s ...", name[6]); parrsb_barrier(&c), t = comm_time(); - check_error(matchPeriodicFaces(mesh, &c, &bfr), name[6]); + match_periodic_faces(mesh, &c, verbose - 1, &bfr); duration[6] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); - debug_print(&c, verbose, "\t%s ...", name[7]); + parrsb_print(&c, verbose - 1, "\t%s ...", name[7]); 
parrsb_barrier(&c), t = comm_time(); Point ptr = mesh->elements.ptr; for (uint i = 0; i < nelt; i++) { @@ -299,32 +275,29 @@ int parrsb_conn_mesh(long long *vtx, double *coord, int nelt, int ndim, vtx[i * mesh->nv + j] = ptr[i * mesh->nv + j].globalId + 1; } duration[7] = comm_time() - t; - debug_print(&c, verbose, "done.\n"); // Report timing info and finish - double gmin[8], gmax[8], buf[8]; - for (unsigned i = 0; i < 8; i++) - gmax[i] = gmin[i] = duration[i]; - comm_allreduce(&c, gs_double, gs_min, gmin, 8, buf); - comm_allreduce(&c, gs_double, gs_max, gmax, 8, buf); - - if (c.id == 0 && verbose > 1) { - for (unsigned i = 0; i < 7; i++) - printf("%s: %e %e (min max)\n", name[i], gmin[i], gmax[i]); - fflush(stdout); + { + double gmin[8], gmax[8], buf[8]; + for (unsigned i = 0; i < 8; i++) + gmax[i] = gmin[i] = duration[i]; + comm_allreduce(&c, gs_double, gs_min, gmin, 8, buf); + comm_allreduce(&c, gs_double, gs_max, gmax, 8, buf); + + for (unsigned i = 0; i < 7; i++) { + parrsb_print(&c, verbose - 1, "%s: %e %e (min max)", name[i], gmin[i], + gmax[i]); + } } - parrsb_barrier(&c), tall = comm_time() - tall; - if (c.id == 0) { - printf("parCon (tol = %e) finished in %g s\n", tol, tall); - fflush(stdout); - } + parrsb_barrier(&c); + tall = comm_time() - tall; + parrsb_print(&c, verbose, "parCon (tol = %e) finished in %g s", tol, tall); buffer_free(&bfr), mesh_free(mesh), comm_free(&c); return 0; } -#undef check_error //============================================================================= // Fortran interface diff --git a/src/fiedler.c b/src/fiedler.c index 17b6425c..f98edbe7 100644 --- a/src/fiedler.c +++ b/src/fiedler.c @@ -3,6 +3,9 @@ #include "parrsb-impl.h" #include "sort.h" +#include +#include + #define MM 500 extern void matrix_inverse(int N, double *A); @@ -39,25 +42,24 @@ int power_serial(double *y, uint N, double *A, int verbose) { time_t t; srand((unsigned)time(&t)); - int i; scalar norm = 0.0; - for (i = 0; i < N; i++) { + for (uint i = 0; i < N; 
i++) { y[i] = (rand() % 50) / 50.0; norm += y[i] * y[i]; } scalar normi = 1.0 / sqrt(norm); - for (i = 0; i < N; i++) + for (uint i = 0; i < N; i++) y[i] *= normi; double *Ay = tcalloc(double, N); - int j, k, l; scalar err = 1.0, lambda; + unsigned i; for (i = 0; i < 100; i++) { norm = 0.0; - for (j = 0; j < N; j++) { + for (uint j = 0; j < N; j++) { Ay[j] = 0.0; - for (k = 0; k < N; k++) { + for (uint k = 0; k < N; k++) { Ay[j] += A[j * N + k] * y[k]; } norm += Ay[j] * Ay[j]; @@ -68,10 +70,10 @@ int power_serial(double *y, uint N, double *A, int verbose) { lambda = sqrt(norm); normi = 1.0 / sqrt(norm); - for (j = 0; j < N; j++) + for (uint j = 0; j < N; j++) y[j] = Ay[j] * normi; - if (fabs(err) < 1.e-12) + if (fabs(err) < 1e-12) break; } free(Ay); @@ -81,16 +83,16 @@ int power_serial(double *y, uint N, double *A, int verbose) { int inv_power_serial(double *y, uint N, double *A, int verbose) { double *Ainv = tcalloc(double, N *N); - int j, k; - for (j = 0; j < N; j++) { - for (k = 0; k < N; k++) + for (uint j = 0; j < N; j++) { + for (uint k = 0; k < N; k++) Ainv[j * N + k] = A[k * N + j]; } matrix_inverse(N, Ainv); + uint j; for (j = 0; j < N; j++) { - for (k = 0; k < N; k++) + for (uint k = 0; k < N; k++) A[j * N + k] = Ainv[k * N + j]; } j = power_serial(y, N, Ainv, verbose); @@ -101,7 +103,7 @@ int inv_power_serial(double *y, uint N, double *A, int verbose) { } static int project(scalar *x, uint n, scalar *b, struct laplacian *L, - struct mg *d, struct comm *c, int miter, double tol, + struct mg *d, struct comm *c, unsigned miter, double tol, int null_space, int verbose, buffer *bfr) { slong out[2][1], buf[2][1], in = n; comm_scan(out, c, gs_long, gs_add, &in, 1, buf); @@ -211,10 +213,9 @@ static int project(scalar *x, uint n, scalar *b, struct laplacian *L, // Input z should be orthogonal to 1-vector, have unit norm. // inverse iteration should not change z. 
-static int inverse(scalar *y, struct array *elements, int nv, scalar *z, - struct comm *gsc, int miter, int mpass, double tol, - int factor, int sagg, int grammian, slong nelg, - buffer *buf) { +static int inverse(scalar *y, struct array *elements, unsigned nv, scalar *z, + struct comm *gsc, unsigned miter, unsigned mpass, double tol, + int factor, int grammian, slong nelg, buffer *buf) { metric_tic(gsc, RSB_INVERSE_SETUP); uint lelt = elements->n; struct rsb_element *elems = (struct rsb_element *)elements->ptr; @@ -241,7 +242,7 @@ static int inverse(scalar *y, struct array *elements, int nv, scalar *z, struct crystal cr; crystal_init(&cr, gsc); struct par_mat *L = par_csr_setup_con(lelt, eid, vtx, nv, 1, gsc, &cr, buf); - struct mg *d = mg_setup(L, factor, sagg, &cr, buf); + struct mg *d = mg_setup(L, factor, &cr, buf); crystal_free(&cr); metric_toc(gsc, RSB_INVERSE_SETUP); @@ -275,7 +276,7 @@ static int inverse(scalar *y, struct array *elements, int nv, scalar *z, ortho(z, lelt, nelg, gsc); - int N = i + 1; + uint N = i + 1; if (grammian == 1) { // if k>1; // Z(:,k)=z-Z(:,1:k-1)*(Z(:,1:k-1)'*z); @@ -378,12 +379,12 @@ static int tqli(scalar *eVectors, scalar *eValues, sint n, scalar *diagonal, e[n - 1] = 0.0; for (i = 0; i < n; i++) { - for (uint j = 0; j < n; j++) + for (sint j = 0; j < n; j++) eVectors[i * n + j] = 0; eVectors[i * n + i] = 1; } - int j, k, l, iter, m; + sint j, k, l, iter, m; for (l = 0; l < n; l++) { iter = 0; do { @@ -463,12 +464,12 @@ static int tqli(scalar *eVectors, scalar *eValues, sint n, scalar *diagonal, for (k = 0; k < n; k++) { e[k] = 0; - for (uint i = 0; i < n; i++) + for (sint i = 0; i < n; i++) e[k] += eVectors[k * n + i] * eVectors[k * n + i]; if (e[k] > 0.0) e[k] = sqrt(fabs(e[k])); scalar scale = 1.0 / e[k]; - for (uint i = 0; i < n; i++) + for (sint i = 0; i < n; i++) eVectors[k * n + i] *= scale; } @@ -566,9 +567,9 @@ static int lanczos_aux(scalar *diag, scalar *upper, scalar *rr, uint lelt, return iter; } -static int 
lanczos(scalar *fiedler, struct array *elements, int nv, - scalar *initv, struct comm *gsc, int miter, int mpass, - double tol, slong nelg, buffer *bfr) { +static int lanczos(scalar *fiedler, struct array *elements, unsigned nv, + scalar *initv, struct comm *gsc, unsigned miter, + unsigned mpass, double tol, slong nelg, buffer *bfr) { metric_tic(gsc, RSB_LANCZOS_SETUP); uint lelt = elements->n; struct rsb_element *elems = (struct rsb_element *)elements->ptr; @@ -582,7 +583,7 @@ static int lanczos(scalar *fiedler, struct array *elements, int nv, scalar *rr = tcalloc(scalar, (miter + 1) * lelt); scalar *eVectors = tcalloc(scalar, miter * miter); scalar *eValues = tcalloc(scalar, miter); - int iter = miter, ipass; + uint iter = miter, ipass; for (ipass = 0; iter == miter && ipass < mpass; ipass++) { double t = comm_time(); iter = lanczos_aux(alpha, beta, rr, lelt, nelg, miter, tol, initv, wl, gsc, @@ -618,8 +619,12 @@ static int lanczos(scalar *fiedler, struct array *elements, int nv, return (ipass - 1) * miter + iter; } -int fiedler(struct array *elements, int nv, parrsb_options *opts, +int fiedler(struct array *elements, int nv, const parrsb_options *const opts, struct comm *gsc, buffer *buf, int verbose) { + // Return if the number of processes is equal to 1. 
+ if (gsc->np == 1) + return 0; + metric_tic(gsc, RSB_FIEDLER_SETUP); uint lelt = elements->n; slong out[2][1], wrk[2][1], in = lelt; @@ -653,7 +658,7 @@ int fiedler(struct array *elements, int nv, parrsb_options *opts, case 1: iter = inverse(f, elements, nv, initv, gsc, opts->rsb_max_iter, opts->rsb_max_passes, opts->rsb_tol, opts->rsb_mg_factor, - opts->rsb_mg_sagg, opts->rsb_mg_grammian, nelg, buf); + opts->rsb_mg_grammian, nelg, buf); break; default: break; diff --git a/src/helpers.c b/src/helpers.c index 123b5f78..46c282f3 100644 --- a/src/helpers.c +++ b/src/helpers.c @@ -10,25 +10,21 @@ void parrsb_print_stack(void) { void *bt[50]; int bt_size = backtrace(bt, 50); + if (bt_size == 0) { + fprintf(stderr, "backtrace(): Obtained 0 stack frames.\n"); + return; + } + char **symbols = backtrace_symbols(bt, bt_size); - printf("backtrace(): obtained %d stack frames.\n", bt_size); - for (unsigned i = 0; i < bt_size; i++) - printf("%s\n", symbols[i]); + fprintf(stderr, "backtrace(): obtained %d stack frames.\n", bt_size); + for (unsigned i = 0; i < (unsigned)bt_size; i++) + fprintf(stderr, "%s\n", symbols[i]); free(symbols); } #else -void parrsb_print_stack(){}; +void parrsb_print_stack() {} #endif // defined __GLIBC__ -double parrsb_get_max_rss() { - struct rusage r_usage; - getrusage(RUSAGE_SELF, &r_usage); -#if defined(__APPLE__) && defined(__MACH__) - return (double)r_usage.ru_maxrss; -#else - return (double)(r_usage.ru_maxrss * 1024L); -#endif -} int log2ll(long long n) { int k = 0; while (n > 1) @@ -38,7 +34,7 @@ int log2ll(long long n) { } int parrsb_dist_mesh(unsigned int *nelt_, long long **vl_, double **coord_, - int *part, int nv, MPI_Comm comm) { + int *part, unsigned nv, MPI_Comm comm) { typedef struct { int proc; long long vtx[MAXNV]; @@ -60,7 +56,7 @@ int parrsb_dist_mesh(unsigned int *nelt_, long long **vl_, double **coord_, } assert(elements.n == nelt); - int ndim = (nv == 8) ? 3 : 2; + unsigned ndim = (nv == 8) ? 
3 : 2; elem_data *ed = elements.ptr; double *coord = (coord_ == NULL ? NULL : *coord_); if (coord != NULL) { @@ -126,7 +122,7 @@ int parrsb_setup_mesh(unsigned *nelt, unsigned *nv, long long **vl, parrsb_check_error(err, comm); parrsb_options opt = parrsb_default_options; - err = parrsb_part_mesh(part, NULL, *vl, *coord, *nelt, *nv, opt, comm); + err = parrsb_part_mesh(part, *vl, *coord, NULL, *nelt, *nv, &opt, comm); parrsb_check_error(err, comm); // Redistribute data based on identified partitions @@ -143,16 +139,13 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx, struct comm comm; comm_init(&comm, ce); - int np = comm.np; - int id = comm.id; - + uint np = comm.np; if (np == 1) return; - int Npts = nelt * nv; - int i; + size_t Npts = nelt * nv; slong *data = (slong *)malloc((Npts + 1) * sizeof(slong)); - for (i = 0; i < Npts; i++) + for (size_t i = 0; i < Npts; i++) data[i] = vtx[i]; struct gs_data *gsh = gs_setup(data, Npts, &comm, 0, gs_pairwise, 0); @@ -165,11 +158,11 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx, gs_free(gsh); free(data); - int nelMin, nelMax, nelSum; - int ncMin, ncMax, ncSum; - int nsMin, nsMax, nsSum; - int nssMin, nssMax, nssSum; - int b; + sint nelMin, nelMax, nelSum; + sint ncMin, ncMax, ncSum; + sint nsMin, nsMax, nsSum; + sint nssMin, nssMax, nssSum; + sint b; ncMax = Nmsg; ncMin = Nmsg; @@ -181,7 +174,7 @@ void parrsb_get_part_stat(int *nc, int *ns, int *nss, int *nel, long long *vtx, nsMax = Ncomm[0]; nsMin = Ncomm[0]; nsSum = Ncomm[0]; - for (i = 1; i < Nmsg; ++i) { + for (int i = 1; i < Nmsg; ++i) { nsMax = Ncomm[i] > Ncomm[i - 1] ? Ncomm[i] : Ncomm[i - 1]; nsMin = Ncomm[i] < Ncomm[i - 1] ? 
Ncomm[i] : Ncomm[i - 1]; nsSum += Ncomm[i]; @@ -265,23 +258,15 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]) { in->mesh = NULL, in->tol = 2e-1; in->test = 0, in->dump = 0, in->verbose = 0, in->nactive = INT_MAX; - in->ilu_type = 0, in->ilu_tol = 1e-1, in->ilu_pivot = 0; - in->crs_type = 0, in->crs_tol = 1e-3; - - static struct option long_options[] = { - {"mesh", required_argument, 0, 0}, - {"tol", optional_argument, 0, 1}, - {"test", optional_argument, 0, 2}, - {"dump", optional_argument, 0, 3}, - {"nactive", optional_argument, 0, 4}, - {"verbose", optional_argument, 0, 5}, - {"ilu_type", optional_argument, 0, 10}, - {"ilu_tol", optional_argument, 0, 11}, - {"ilu_pivot", optional_argument, 0, 12}, - {"crs_type", optional_argument, 0, 20}, - {"crs_tol", optional_argument, 0, 21}, - {"help", optional_argument, 0, 91}, - {0, 0, 0, 0}}; + + static struct option long_options[] = {{"mesh", required_argument, 0, 0}, + {"tol", optional_argument, 0, 10}, + {"test", optional_argument, 0, 20}, + {"dump", optional_argument, 0, 30}, + {"nactive", optional_argument, 0, 40}, + {"verbose", optional_argument, 0, 50}, + {"help", optional_argument, 0, 99}, + {0, 0, 0, 0}}; size_t len; for (;;) { @@ -295,37 +280,22 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]) { in->mesh = tcalloc(char, len + 1); strncpy(in->mesh, optarg, len); break; - case 1: + case 10: in->tol = atof(optarg); break; - case 2: + case 20: in->test = 1; break; - case 3: + case 30: in->dump = 1; break; - case 4: + case 40: in->nactive = atoi(optarg); break; - case 5: + case 50: in->verbose = atoi(optarg); break; - case 10: - in->ilu_type = atoi(optarg); - break; - case 11: - in->ilu_tol = atof(optarg); - break; - case 12: - in->ilu_pivot = atoi(optarg); - break; - case 20: - in->crs_type = atoi(optarg); - break; - case 21: - in->crs_tol = atof(optarg); - break; - case 91: + case 99: print_help(); break; default: @@ -400,7 +370,7 @@ int parrsb_vector_dump(const char 
*fname, scalar *y, struct rsb_element *elm, slong out[2][1], in = nelt; comm_scan(out, c, gs_long, gs_add, &in, 1, wrk); - slong start = out[0][0], nelgt = out[1][0]; + slong nelgt = out[1][0]; int ndim = (nv == 8) ? 3 : 2; uint write_size = ((ndim + 1) * sizeof(double) + sizeof(slong)) * nelt; diff --git a/src/ilu.c b/src/ilu.c deleted file mode 100644 index ac26f42b..00000000 --- a/src/ilu.c +++ /dev/null @@ -1,1513 +0,0 @@ -#include "ilu.h" -#include - -#define CSC 0 -#define CSR 1 - -//============================================================================= -// ILU levels -// -// Currently there are two methods of finding levels -// 1. Based on final element distribution among processors (dst_lvls) -// 2. Based on RSB levels while partitioning (rsb_lvls) -struct key_t { - ulong e; - uint p; -}; - -struct e2n_t { - ulong e, n; -}; - -struct request_t { - ulong r; - uint p, o; -}; - -static int find_unique_nbrs(struct array *e2nm, uint n, int nv, - const ulong *ids, const slong *vtx, - struct crystal *cr, buffer *bfr) { - struct array nbrs; - find_nbrs(&nbrs, ids, vtx, n, nv, cr, bfr); - - array_init(struct e2n_t, e2nm, n * 10); - if (nbrs.n > 0) { - sarray_sort_2(struct nbr, nbrs.ptr, nbrs.n, r, 1, c, 1, bfr); - struct nbr *pn = (struct nbr *)nbrs.ptr; - - struct e2n_t en; - uint i, j; - for (i = 1, j = 0; i < nbrs.n; i++) { - if ((pn[i].r != pn[j].r) || (pn[i].c != pn[j].c)) { - en.e = pn[j].r, en.n = pn[j].c; - array_cat(struct e2n_t, e2nm, &en, 1); - j = i; - } - } - en.e = pn[j].r, en.n = pn[j].c; - array_cat(struct e2n_t, e2nm, &en, 1); - sarray_sort_2(struct e2n_t, e2nm->ptr, e2nm->n, e, 1, n, 1, bfr); - } - array_free(&nbrs); - - return 0; -} - -static int local_dof(const ulong *rows, const ulong I, const uint n) { - for (uint i = 0; i < n; i++) - if (rows[i] == I) - return i; - return n; -} - -// Fill dofs array with unique dofs found in this processr -static int update_keys(struct array *keys, struct array *nbrs, const uint ln, - const ulong *lids, 
struct crystal *cr, buffer *bfr) { - uint i, j; - struct array temp, rqst; - array_init(struct request_t, &temp, nbrs->n); - array_init(struct request_t, &rqst, nbrs->n); - - struct comm *c = &cr->comm; - struct e2n_t *pn = (struct e2n_t *)nbrs->ptr; - struct request_t t; - for (i = 0; i < nbrs->n; i++) { - t.r = pn[i].n, t.p = t.r % c->np; - t.o = (local_dof(lids, t.r, ln) < ln); - array_cat(struct request_t, &temp, &t, 1); - } - - struct request_t *pt = (struct request_t *)temp.ptr; - if (temp.n > 0) { - sarray_sort(struct request_t, temp.ptr, temp.n, r, 1, bfr); - for (i = 1, j = 0; i < temp.n; i++) { - if (pt[i].r != pt[j].r) { - array_cat(struct request_t, &rqst, &pt[j], 1); - j = i; - } - } - array_cat(struct request_t, &rqst, &pt[j], 1); - } - - sarray_transfer(struct request_t, &rqst, p, 1, cr); - sarray_sort_2(struct request_t, rqst.ptr, rqst.n, r, 1, o, 0, bfr); - - struct request_t *pr = (struct request_t *)rqst.ptr; - if (rqst.n > 0) { - for (i = 1, j = 0; i < rqst.n; i++) { - if (pr[i].r != pr[j].r) { - // owner for dof j, j + 1, ... i - 1 is pr[i - 1].p - assert(pr[i - 1].o == 1); - for (; j < i; j++) - pr[j].o = pr[i - 1].p; - // j = i at the end - } - } - assert(pr[i - 1].o == 1); - for (; j < i; j++) - pr[j].o = pr[i - 1].p; - } - - sarray_transfer(struct request_t, &rqst, o, 0, cr); - sarray_sort_2(struct request_t, rqst.ptr, rqst.n, r, 1, p, 0, bfr); - - // All the requests are forwarded correctly. Send the data back - // to the requesting processors. Note that the requests are unique. 
- struct key_t *pk = (struct key_t *)keys->ptr; - pr = (struct request_t *)rqst.ptr; - temp.n = 0; - for (i = j = 0; i < rqst.n; i++) { - while (pk[j].e < pr[i].r) - j++; - // Sanity check - assert(pk[j].e == pr[i].r); - t.o = pr[i].p; - for (uint k = j; k < keys->n && pk[k].e == pk[j].e; k++) { - t.r = pk[k].e, t.p = pk[k].p; - array_cat(struct request_t, &temp, &t, 1); - } - } - array_free(&rqst); - - sarray_transfer(struct request_t, &temp, o, 0, cr); - sarray_sort_2(struct request_t, temp.ptr, temp.n, r, 1, p, 0, bfr); - - // Update the keys array. Update here is a complete rewrite. - struct array keyt; - array_init(struct key_t, &keyt, temp.n); - - struct key_t s; - pt = (struct request_t *)temp.ptr; - for (i = 0; i < ln; i++) { - ulong e = lids[i]; - // Find `e` in the nbrs array - for (j = 0; j < nbrs->n && pn[j].e < e; j++) - ; - assert(j < nbrs->n && pn[j].e == e); - // Now go through all the neighbors and update the keys - for (; j < nbrs->n && pn[j].e == e; j++) { - ulong n = pn[j].n; - // find the key of `n` in temp - uint k = 0; - for (; k < temp.n && pt[k].r < n; k++) - ; - assert(k < temp.n && pt[k].r == n); - for (; k < temp.n && pt[k].r == n; k++) { - s.e = e, s.p = pt[k].p; - array_cat(struct key_t, &keyt, &s, 1); - } - } - } - array_free(&temp); - - keys->n = 0; - if (keyt.n > 0) { - sarray_sort_2(struct key_t, keyt.ptr, keyt.n, e, 1, p, 0, bfr); - pk = (struct key_t *)keyt.ptr; - for (i = 1, j = 0; i < keyt.n; i++) { - if ((pk[i].e != pk[j].e) || (pk[i].p != pk[j].p)) { - array_cat(struct key_t, keys, &pk[j], 1); - j = i; - } - } - array_cat(struct key_t, keys, &pk[j], 1); - } - - array_free(&keyt); - - return 0; -} - -// This routine will update `lvl_n`, `lvl_off` and `lvl_ids` with the DOF -// belongig to current level. In the process, it will remove the DOFs and their -// connectivity from ids, and vtx arrays. `n` will be adjusted to reflect -// changes. 
-static int dst_lvls_aux(int *lvl_n, uint *lvl_off, uint *lvl_owner, - ulong *lvl_ids, uint *n, ulong *ids, slong *vtx, int nv, - struct array *keys, struct comm *c, int verbose) { - // Find the min key size locally. - uint i, j, k; - sint min = INT_MAX; - struct key_t *pk = (struct key_t *)keys->ptr; - if (keys->n > 0) { - for (i = 1, j = 0; i < keys->n; i++) { - if (pk[i].e != pk[j].e) { - // Different element, update min key size if required - min = (min > i - j ? i - j : min); - j = i; - } - } - min = (min > i - j ? i - j : min); - } - - sint buf[2]; - comm_allreduce(c, gs_int, gs_min, &min, 1, buf); - if (min == INT_MAX) - return 0; - - int lvl = *lvl_n; - uint off = lvl_off[lvl]; - if (keys->n > 0) { - for (i = 1, j = 0; i < keys->n; i++) { - if (pk[i].e != pk[j].e) { - if (i - j == min) - lvl_ids[off] = pk[j].e, lvl_owner[off] = pk[i - 1].p, off++; - j = i; - } - } - if (i - j == min) - lvl_ids[off] = pk[j].e, lvl_owner[off] = pk[i - 1].p, off++; - } - - assert(lvl < 50); - lvl++, lvl_off[lvl] = off; - if (verbose > 1) { - printf("id: %d |key| = %d lvl = %d size = %u\n", c->id, min, lvl, - lvl_off[lvl] - lvl_off[lvl - 1]); - fflush(stdout); - } - - // Now we have to update ids and vtx. This can be done in place. 
- for (i = lvl_off[lvl - 1], j = 0, k = 0; i < lvl_off[lvl]; i++, j++) { - for (; j < *n && ids[j] < lvl_ids[i]; j++, k++) { - ids[k] = ids[j]; - for (int v = 0; v < nv; v++) - vtx[k * nv + v] = vtx[j * nv + v]; - } - assert(j < *n && ids[j] == lvl_ids[i]); - } - for (; j < *n; j++, k++) { - ids[k] = ids[j]; - for (int v = 0; v < nv; v++) - vtx[k * nv + v] = vtx[j * nv + v]; - } - - *n -= lvl_off[lvl] - lvl_off[lvl - 1], *lvl_n = lvl; - - return 0; -} - -static int dst_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids, - const uint n_, const int nv, const ulong *ids_, - const slong *vtx_, struct crystal *cr, int verbose, - buffer *bfr) { - // Copy ids and vtx since we are going to modify them - uint n = n_; - ulong *ids = tcalloc(ulong, n); - slong *vtx = tcalloc(slong, n * nv); - for (uint i = 0, j = 0; i < n; i++) { - ids[i] = ids_[i]; - for (int v = 0; v < nv; v++, j++) - vtx[j] = vtx_[j]; - } - - struct comm *c = &cr->comm; - - // Initialize keys: set key of each dof to the current MPI rank. - // keys array should has unique entries and should be sorted first - // by .e and then by .p. - struct array keys; - array_init(struct key_t, &keys, n); - struct key_t e2p = {.e = 0, .p = c->id}; - for (uint i = 0; i < n; i++) { - e2p.e = ids[i]; - array_cat(struct key_t, &keys, &e2p, 1); - } - sarray_sort_2(struct key_t, keys.ptr, keys.n, e, 1, p, 0, bfr); - - slong ng = n, buf[2]; - comm_allreduce(c, gs_long, gs_add, &ng, 1, buf); - - int nlvls = 0; - struct array nbrs; - while (ng > 0) { - // Find unique neighbors of a DOF. DOF is a neighbor of itself. - find_unique_nbrs(&nbrs, n, nv, ids, vtx, cr, bfr); - - // Send and receive key to/from neighbors. We forward all the requests - // for the key of a DOF to the processor that owns the DOF and then that - // processor takes care of the request. To do that, we first find all the - // unique requests. 
- update_keys(&keys, &nbrs, n, ids, cr, bfr); - - // Find the min key size - // Add all the dofs with key size equal to min key size to current level - // Update ids and vtx by removing the dofs with min key size - dst_lvls_aux(&nlvls, lvl_off, lvl_owner, lvl_ids, &n, ids, vtx, nv, &keys, - c, verbose); - - ng = n; - comm_allreduce(c, gs_long, gs_add, &ng, 1, buf); - if (verbose > 1) { - if (c->id == 0) - printf("lvl = %d ng = %lld\n", nlvls, ng); - fflush(stdout); - } - array_free(&nbrs); - } - - free(ids), free(vtx); - - return nlvls; -} - -static int rsb_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids, - const uint n, const int nv, const ulong *ids, - const slong *vtx, struct comm *ci, int verbose, - buffer *bfr) { - slong ng = n, buf[2]; - comm_allreduce(ci, gs_long, gs_add, &ng, 1, buf); - - // What we are going to do is identify the elements in the interface at each - // level. These elements constitute the level of ILU. Owner of the element is - // the processor which at least own a single vertex (possibly duplicated) of - // the element. 
- - uint size = n * nv; - sint *in = tcalloc(sint, size); - sint *lvl = tcalloc(sint, n); - sint *owner = tcalloc(sint, n); - if (owner == NULL || lvl == NULL || in == NULL) { - fprintf(stderr, "Failed to allocate lvl, owner or in !\n"); - exit(1); - } - - struct comm c, t; - comm_dup(&c, ci); - - uint i; - sint nlvls = 1, j; - while (c.np > 1) { - struct gs_data *gsh = gs_setup(vtx, size, &c, 0, gs_pairwise, 0); - - int bin = (c.id >= (c.np + 1) / 2); - for (i = 0; i < size; i++) - in[i] = bin; - - gs(in, gs_int, gs_max, 0, gsh, bfr); - - if (bin == 1) { - for (i = 0; i < size; i++) - in[i] = 0; - } - - gs(in, gs_int, gs_max, 0, gsh, bfr); - - sint ownr = 0; - for (i = 0; i < n; i++) { - for (j = 0; j < nv; j++) { - if (in[i * nv + j] > 0) { - if (lvl[i] == 0) { - lvl[i] = nlvls; - ownr = ci->id + 1; - } - break; - } - } - } - - comm_allreduce(&c, gs_int, gs_max, &ownr, 1, buf); - - for (i = 0; i < n; i++) { - if (lvl[i] == nlvls) - owner[i] = ownr - 1; - } - - nlvls++; - - gs_free(gsh); - comm_split(&c, bin, c.id, &t), comm_free(&c); - comm_dup(&c, &t), comm_free(&t); - } - comm_free(&c); - - int rem = 0; - for (uint i = 0; i < n; i++) { - if (lvl[i] == 0) { - lvl[i] = nlvls; - owner[i] = ci->id; - rem = 1; - } - } - nlvls += rem; - comm_allreduce(ci, gs_int, gs_max, &nlvls, 1, buf); - - // Reverse the level numbers - for (uint i = 0; i < n; i++) - lvl[i] = nlvls - lvl[i]; - - struct linfo_t { - uint lvl, owner; - ulong id; - }; - - struct array linfos; - array_init(struct linfo_t, &linfos, n); - - struct linfo_t linfo = {.lvl = 0, .owner = 0, .id = 0}; - for (uint i = 0; i < n; i++) { - linfo.lvl = lvl[i], linfo.owner = owner[i], linfo.id = ids[i]; - array_cat(struct linfo_t, &linfos, &linfo, 1); - } - sarray_sort(struct linfo_t, linfos.ptr, linfos.n, lvl, 0, bfr); - - if (linfos.n > 0) { - struct linfo_t *pl = (struct linfo_t *)linfos.ptr; - for (uint l = 0, i = 0; l < nlvls; l++) { - for (; i < linfos.n && pl[i].lvl == l; i++) - lvl_ids[i] = pl[i].id, 
lvl_owner[i] = pl[i].owner; - lvl_off[l + 1] = i; - } - } - - array_free(&linfos); - free(owner), free(lvl), free(in); - - return nlvls; -} - -static int find_lvls(uint *lvl_off, uint *lvl_owner, ulong *lvl_ids, - const uint n, const int nv, const ulong *ids, - const slong *vtx, int type, struct crystal *cr, - int verbose, buffer *bfr) { - int nlvls = 0; - switch (type) { - case 0: - nlvls = dst_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, cr, verbose, - bfr); - break; - case 1: - nlvls = rsb_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, &cr->comm, - verbose, bfr); - break; - default: - break; - } - return nlvls; -} - -//============================================================================= -// ILU -// -struct ilu { - int pivot, verbose; - // 1st dropping rule: An entry a_ij is dropped abs(a_ij) < tol - scalar tol; - // 2nd dropping rule: Entries are dropped so that total nnz per row/col < p - uint nnz_per_row; - - // Calculated values internal to ILU - uint nlvls, *lvl_off; - ulong *perm; - struct par_mat A, L, U; - struct crystal cr; -}; - -//============================================================================= -// ILU(0) -// -static int ilu0_get_rows(struct par_mat *E, int lvl, uint *lvl_off, - struct par_mat *A, struct crystal *cr, buffer *bfr) { - struct owner { - ulong ri; - uint rp, p; - }; - - assert(IS_CSR(A) && !IS_DIAG(A)); - - struct array owners, requests; - array_init(struct owner, &owners, A->rn * 30); - array_init(struct owner, &requests, A->rn * 30); - - struct comm *c = &cr->comm; - struct owner t; - for (uint i = lvl_off[lvl - 1]; i < lvl_off[lvl]; i++) { - ulong I = A->rows[i]; - for (uint j = A->adj_off[i]; - j < A->adj_off[i + 1] && A->cols[A->adj_idx[j]] < I; j++) { - t.ri = A->cols[A->adj_idx[j]], t.rp = c->np, t.p = t.ri % c->np; - array_cat(struct owner, &owners, &t, 1); - } - } - - for (uint i = lvl_off[0]; i < lvl_off[lvl]; i++) { - t.ri = A->rows[i], t.rp = c->id, t.p = t.ri % c->np; - array_cat(struct owner, 
&owners, &t, 1); - } - - sarray_sort_2(struct owner, owners.ptr, owners.n, ri, 1, rp, 0, bfr); - struct owner *ptr = (struct owner *)owners.ptr; - uint i, j; - for (i = 0; i < owners.n; i = j) { - for (j = i + 1; j < owners.n && ptr[j].ri == ptr[i].ri; j++) - ; - array_cat(struct owner, &requests, &ptr[i], 1); - } - array_free(&owners); - - // Match row ids and set `p` to the original processor - sarray_transfer(struct owner, &requests, p, 1, cr); - - // Set rp to the owner - sarray_sort_2(struct owner, requests.ptr, requests.n, ri, 1, rp, 0, bfr); - ptr = (struct owner *)requests.ptr; - for (i = 0; i < requests.n; i = j) { - assert(ptr[i].rp < c->np); - for (j = i + 1; j < requests.n && ptr[j].ri == ptr[i].ri; j++) { - assert(ptr[j].rp == c->np); - ptr[j].rp = ptr[i].rp; - } - } - - // Forward requests to the owner processor - sarray_transfer(struct owner, &requests, rp, 0, cr); - - sarray_sort_2(struct owner, requests.ptr, requests.n, ri, 1, p, 0, bfr); - ptr = (struct owner *)requests.ptr; - - struct array sends; - array_init(struct mij, &sends, A->rn * 30); - - for (i = 0; i < requests.n; i = j) { - ulong ri = ptr[i].ri; - uint ro = local_dof(A->rows, ri, A->rn); - assert(ro < A->rn); - for (j = i; j < requests.n && ptr[j].ri == ri; j++) { - // No need to send to owner - if (ptr[j].p != c->id) { - // copy_row(&sends, ro, ptr[j].p, A); - struct mij m = {.r = A->rows[ro], .idx = 0, .p = ptr[j].p}; - for (uint k = A->adj_off[ro], ke = A->adj_off[ro + 1]; k < ke; k++) { - m.c = A->cols[A->adj_idx[k]], m.v = A->adj_val[k]; - array_cat(struct mij, &sends, &m, 1); - } - } - } - } - array_free(&requests); - - sarray_transfer(struct mij, &sends, p, 1, cr); - par_csr_setup(E, &sends, 0, bfr); - array_free(&sends); - - return 0; -} - -static void ilu0_update_row(const uint io, const uint k, struct par_mat *A, - struct par_mat *E, int verbose, int lvl) { - uint *off = A->adj_off, *idx = A->adj_idx; - uint *koff = A->adj_off, *kidx = A->adj_idx; - ulong *cols = A->cols, 
*kcols = A->cols; - scalar *val = A->adj_val, *kval = A->adj_val; - - const ulong K = cols[idx[k]]; - const ulong I = A->rows[io]; - - // Find offsets of K in A - sint ko = -1; - uint j; - for (j = 0; j < A->rn; j++) { - if (A->rows[j] == K) { - ko = j; - break; - } - } - - // Search in E if K is not found in A - if (ko == -1 && E != NULL) { - koff = E->adj_off, kidx = E->adj_idx; - kval = E->adj_val, kcols = E->cols; - for (j = 0; j < E->rn; j++) { - if (E->rows[j] == K) { - ko = j; - break; - } - } - } - - // Oops, K is no where to be found - if (ko == -1) { - fprintf(stderr, "%s:%d lvl = %d, k = %u ko = %d\n", __FILE__, __LINE__, lvl, - k, ko); - exit(1); - } - - // Calculate a_ik = a_ik / a_kk - scalar a_kk = 0; - for (j = koff[ko]; j < koff[ko + 1]; j++) { - if (kcols[kidx[j]] == K) { - a_kk = kval[j]; - break; - } - } - - if (fabs(a_kk) < 1e-10) { - fprintf(stderr, "%s:%d ilu0: Diagonal is zero ! k = %llu\n", __FILE__, - __LINE__, K); - exit(1); - } - - // cols[idx[k]] = K and val[k] = a_ik - scalar a_ik = val[k] / a_kk; - if (verbose) { - printf("a_kk = %lf a_ik = %lf a_ik/a_kk = %lf\n", a_kk, val[j], a_ik); - fflush(stdout); - } - val[k] = a_ik; - - uint kj; - scalar a_kj; - for (j = k + 1; j < off[io + 1]; j++) { - for (kj = koff[ko]; kj < koff[ko + 1] && kcols[kidx[kj]] < cols[idx[j]]; - kj++) - ; - if (kj < koff[ko + 1] && kcols[kidx[kj]] == cols[idx[j]]) - a_kj = kval[kj]; - else - a_kj = 0; - - if (verbose) { - printf("a_ij = %lf a_ik = %lf a_kj = %lf\n", val[j], a_ik, a_kj); - fflush(stdout); - } - // a_ij = a_ij - a_ik * a_kj - val[j] -= a_ik * a_kj; - } -} - -static void ilu0_level(int lvl, uint *lvl_off, struct par_mat *A, - struct par_mat *E, int verbose) { - ulong *cols = A->cols, *rows = A->rows; - uint *off = A->adj_off, *idx = A->adj_idx, i, k; - for (i = lvl_off[lvl - 1] + (lvl == 1); i < lvl_off[lvl]; i++) - for (k = off[i]; k < off[i + 1] && cols[idx[k]] < rows[i]; k++) - ilu0_update_row(i, k, A, E, verbose, lvl); -} - -static void 
ilu0(struct ilu *ilu, buffer *bfr) { - ilu0_level(1, ilu->lvl_off, &ilu->A, NULL, 0); - struct par_mat E; - for (int l = 2; l <= ilu->nlvls; l++) { - ilu0_get_rows(&E, l, ilu->lvl_off, &ilu->A, &ilu->cr, bfr); - ilu0_level(l, ilu->lvl_off, &ilu->A, &E, 0); - par_mat_free(&E); - } -} - -//============================================================================= -// ILUC -// -struct eij_t { - ulong r, c; - uint p; - scalar v; -}; - -// We are going to separate A matrix to L and U where L is the strictly lower -// triangular part of A and U is the upper triangular part of A (including the -// diagonal). Since A is in CSR format, extracting U (in CSR format) is easy. -// L will be distributed by columns and we need to figure out the owner of a -// given column. -static void iluc_sep_lu(struct ilu *ilu, buffer *bfr) { - // Recover the communicator - struct crystal *cr = &ilu->cr; - struct comm *c = &cr->comm; - - // Setup U - struct par_mat *A = &ilu->A; - struct array uijs, lijs; - array_init(struct mij, &uijs, A->rn * 30); - array_init(struct mij, &lijs, A->rn * 30); - - struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0}; - uint i, j, je; - for (i = 0; i < A->rn; i++) { - m.r = A->rows[i]; - j = A->adj_off[i], je = A->adj_off[i + 1]; - for (; j < je && A->cols[A->adj_idx[j]] < m.r; j++) { - m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j]; - m.p = m.c % c->np, m.idx = (local_dof(A->rows, m.c, A->rn) < A->rn); - array_cat(struct mij, &lijs, &m, 1); - } - // Add the unit diagonal to L (We actually don't need to send this) - m.c = m.r, m.v = 1, m.p = m.c % c->np, m.idx = 1; - array_cat(struct mij, &lijs, &m, 1); - - for (; j < je; j++) { - m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j]; - array_cat(struct mij, &uijs, &m, 1); - } - } - - par_mat_setup(&ilu->U, &uijs, CSR, 0, bfr); - array_free(&uijs); - - // Setup L - sarray_transfer(struct mij, &lijs, p, 1, cr); - if (lijs.n > 0) { - sarray_sort_2(struct mij, lijs.ptr, lijs.n, c, 1, idx, 0, bfr); - struct 
mij *pl = (struct mij *)lijs.ptr; - for (i = 1, j = 0; i < lijs.n; i++) { - if (pl[i].c != pl[j].c) { - assert(pl[i - 1].idx == 1); - for (; j < i; j++) - pl[j].p = pl[i - 1].p; - // j == i at the end - } - } - // residual - assert(pl[i - 1].idx == 1); - for (; j < i; j++) - pl[j].p = pl[i - 1].p; - } - - sarray_transfer(struct mij, &lijs, p, 0, cr); - par_mat_setup(&ilu->L, &lijs, CSC, 0, bfr); - array_free(&lijs); -} - -static void iluc_fwrd_rqsts(struct array *fwds, struct array *rqsts, - const int type, const ulong K, - const struct array *A, struct crystal *cr, - buffer *bfr) { - fwds->n = rqsts->n = 0; - struct request_t t = {.r = 0, .p = 0, .o = 1}; - - struct comm *c = &cr->comm; - -#define INIT_RQST(f, g, arr) \ - do { \ - if (A->n > 0) { \ - sarray_sort_2(struct mij, A->ptr, A->n, f, 1, g, 1, bfr); \ - struct mij *pa = (struct mij *)A->ptr; \ - uint i = 1, j = 0; \ - for (; i < A->n; i++) { \ - if (pa[i].f != pa[j].f) { \ - t.r = pa[j].f, t.p = t.r % c->np; \ - array_cat(struct request_t, arr, &t, 1); \ - j = i; \ - } \ - } \ - if (j < i) { \ - t.r = pa[j].f, t.p = t.r % c->np; \ - array_cat(struct request_t, arr, &t, 1); \ - } \ - } \ - } while (0) - - if (type == CSC) - INIT_RQST(r, c, rqsts); - else - INIT_RQST(c, r, rqsts); -#undef INIT_RQST - - if (K > 0) { - t.r = K, t.p = K % c->np, t.o = 0; - array_cat(struct request_t, rqsts, &t, 1); - } - - sarray_transfer(struct request_t, rqsts, p, 1, cr); - - // Okay, we got all the requests (if any) and non-zero row/col ids in the same - // processor. Now we forward the requests to the original owners. 
- if (rqsts->n > 0) { - sarray_sort_2(struct request_t, rqsts->ptr, rqsts->n, r, 1, o, 0, bfr); - struct request_t *pr = (struct request_t *)rqsts->ptr; - uint s = 0, e = 1; - for (; e < rqsts->n; e++) { - if (pr[e].r != pr[s].r) { - if (pr[s].o == 0) { // This is a request - uint p = pr[s].p; - for (s = s + 1; s < e; s++) { - pr[s].o = p; - array_cat(struct request_t, fwds, &pr[s], 1); - } - } - s = e; - } - } - if (s < e && pr[s].o == 0) { - uint p = pr[s].p; - for (s = s + 1; s < e; s++) { - pr[s].o = p; - array_cat(struct request_t, fwds, &pr[s], 1); - } - } - } - - sarray_transfer(struct request_t, fwds, p, 0, cr); -} - -static void iluc_send_data(struct array *data, const int type, struct array *A, - struct array *work, struct crystal *cr, - buffer *bfr) { - if (type == CSC) { - sarray_sort_2(struct mij, A->ptr, A->n, c, 1, r, 1, bfr); - sarray_sort_2(struct eij_t, work->ptr, work->n, r, 1, c, 1, bfr); - } else { - sarray_sort_2(struct mij, A->ptr, A->n, r, 1, c, 1, bfr); - sarray_sort_2(struct eij_t, work->ptr, work->n, c, 1, r, 1, bfr); - } - - // We only have one request per processor, so sorting by processor is the - // same as sorting by row id. But just to be safe we will sort by row id. 
- data->n = 0; - if (work->n > 0) { - struct eij_t *pw = (struct eij_t *)work->ptr; - uint i = 1, j = 0; - for (; i < work->n; i++) { - if ((pw[i].r != pw[j].r) || (pw[i].c != pw[j].c)) { - array_cat(struct eij_t, data, &pw[j], 1); - j = i; - } else - pw[j].v += pw[i].v; - } - if (j < i) - array_cat(struct eij_t, data, &pw[j], 1); - } - - sarray_transfer(struct eij_t, data, p, 0, cr); -} - -static void iluc_get_data(struct array *data, ulong K, int type, - struct array *A, struct array *B, struct crystal *cr, - struct array *rqsts, struct array *fwds, - struct array *work, buffer *bfr) { - iluc_fwrd_rqsts(fwds, rqsts, type, K, A, cr, bfr); - - work->n = 0; - if (fwds->n > 0) { - sarray_sort(struct request_t, fwds->ptr, fwds->n, r, 1, bfr); - struct request_t *pf = (struct request_t *)fwds->ptr; - - uint i, j, k, l, n; - scalar v; - struct eij_t m = {.r = 0, .c = 0, .p = 0, .v = 0}; - -#define FILL_RQST(f, g, nd) \ - do { \ - struct mij *pa = (struct mij *)A->ptr; \ - struct mij *pb = (struct mij *)B->ptr; \ - for (i = 0, j = 0; i < fwds->n; i++) { \ - l = 0; \ - m.f = pf[i].r, m.p = pf[i].o; \ - for (; j < A->n && pa[j].f < m.f; j++) \ - ; \ - assert(j < A->n && pa[j].f == m.f); \ - for (k = j; k < A->n && pa[k].f == m.f && pa[k].g < m.f; k++) { \ - v = pa[k].v; \ - for (; l < B->n && pb[l].f < pa[k].g; l++) \ - ; \ - assert(l < B->n && pb[l].f == pa[k].g); \ - for (n = l; n < B->n && pb[n].f == pa[k].g && (pb[n].g < m.f + nd); \ - n++) \ - ; \ - for (; n < B->n && pb[n].f == pa[k].g; n++) { \ - m.g = pb[n].g, m.v = -v * pb[n].v; \ - array_cat(struct eij_t, work, &m, 1); \ - } \ - } \ - } \ - } while (0) - - if (type == CSC) - FILL_RQST(r, c, 0); - else - FILL_RQST(c, r, 1); - -#undef FILL_RQST - } - - iluc_send_data(data, type, A, work, cr, bfr); -} - -static void iluc_update(struct array *tij, ulong K, struct array *data, int row, - buffer *bfr) { - // FIXME: This can be done more efficiently - struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0}; - uint j; 
- if (K) { - if (row) { - sarray_sort(struct eij_t, data->ptr, data->n, c, 1, bfr); - struct eij_t *pd = (struct eij_t *)data->ptr; - m.r = K; - for (j = 0; j < data->n; j++) { - m.c = pd[j].c, m.v = pd[j].v; - array_cat(struct mij, tij, &m, 1); - } - } else { - sarray_sort(struct eij_t, data->ptr, data->n, r, 1, bfr); - struct eij_t *pd = (struct eij_t *)data->ptr; - m.c = K; - for (; j < data->n; j++) { - m.r = pd[j].r, m.v = pd[j].v; - array_cat(struct mij, tij, &m, 1); - } - } - } - - struct array tmp; - array_init(struct mij, &tmp, tij->n + 1); - - if (tij->n > 0) { - uint i = 1, j = 0; - struct mij *pt = NULL; - if (row) { - sarray_sort(struct mij, tij->ptr, tij->n, c, 1, bfr); - pt = (struct mij *)tij->ptr; - for (; i < tij->n; i++) { - if (pt[i].c != pt[j].c) { - array_cat(struct mij, &tmp, &pt[j], 1); - j = i; - } else - pt[j].v += pt[i].v; - } - } else { - sarray_sort(struct mij, tij->ptr, tij->n, r, 1, bfr); - pt = (struct mij *)tij->ptr; - for (; i < tij->n; i++) { - if (pt[i].r != pt[j].r) { - array_cat(struct mij, &tmp, &pt[j], 1); - j = i; - } else - pt[j].v += pt[i].v; - } - } - if (j < i && pt) - array_cat(struct mij, &tmp, &pt[j], 1); - - tij->n = 0; - array_cat(struct mij, tij, tmp.ptr, tmp.n); - } - - array_free(&tmp); -} - -static void iluc_level(struct array *lij, struct array *uij, int lvl, - struct ilu *ilu, struct array *data, struct array *work, - buffer *bfr) { - // Work arrays - struct array rij, cij; - array_init(struct mij, &rij, 30); - array_init(struct mij, &cij, 30); - - struct array rqst, fwds; - array_init(struct request_t, &rqst, 30); - array_init(struct request_t, &fwds, 30); - - struct par_mat *L = &ilu->L, *U = &ilu->U; - struct crystal *cr = &ilu->cr; - - // Figure out start and end of the level and agree on a range - uint *lvl_off = ilu->lvl_off, s = lvl_off[lvl - 1]; - sint buf[2], size = lvl_off[lvl] - s; - comm_allreduce(&cr->comm, gs_int, gs_max, &size, 1, buf); - uint e = s + size; - - uint i, j, je, k; - for (k = s; k 
< e; k++) { - ulong K = (k < lvl_off[lvl]) ? U->rows[k] : 0; - - // Fetch required data (combine with the other call below) - iluc_get_data(data, K, CSC, lij, uij, cr, &rqst, &fwds, work, bfr); - - // Init z[1:K] = 0, z[K:n] = a_{K, K:n}, i.e., z = u_{K,:} - rij.n = 0; - if (K) { - struct mij m = {.r = K, .c = 0, .idx = 0, .p = 0, .v = 0}; - for (j = U->adj_off[k], je = U->adj_off[k + 1]; j < je; j++) { - m.c = U->cols[U->adj_idx[j]], m.v = U->adj_val[j]; - array_cat(struct mij, &rij, &m, 1); - } - } - // Update z if l_KI != 0 for all I, 1 <= I < K - iluc_update(&rij, K, data, 1, bfr); - - // Fetch required data (combine with the other call above) - iluc_get_data(data, K, CSR, uij, lij, cr, &rqst, &fwds, work, bfr); - - // Init w[1:K] = 0, w[K] = 1, w[K+1:n] = a_{K+1:n, K}, i.e., w = l_{:, K} - cij.n = 0; - if (K) { - struct mij m = {.r = 0, .c = K, .idx = 0, .p = 0, .v = 0}; - for (j = L->adj_off[k] + 1, je = L->adj_off[k + 1]; j < je; j++) { - m.r = L->rows[L->adj_idx[j]], m.v = L->adj_val[j]; - array_cat(struct mij, &cij, &m, 1); - } - } - // Update w if u_IK != 0 for all I, 1 <= I < K - iluc_update(&cij, K, data, 0, bfr); - - // Set u_{k, :} = z and find u_kk - scalar u_kk = 1; - struct mij *pt = (struct mij *)rij.ptr; - if (K) { - if (rij.n > 0 && fabs(pt[0].v) > 1e-12) - u_kk = pt[0].v; - array_cat(struct mij, uij, rij.ptr, rij.n); - } - - // Set l_{:, K} = w/u_KK and l_KK = 1 - pt = (struct mij *)cij.ptr; - for (j = 0; j < cij.n; j++) - pt[j].v /= u_kk; - - if (K) { - struct mij m = {.r = K, .c = K, .idx = 0, .p = 0, .v = 1}; - array_cat(struct mij, &cij, &m, 1); - array_cat(struct mij, lij, cij.ptr, cij.n); - } - } - - array_free(&rij), array_free(&cij); - array_free(&rqst), array_free(&fwds); -} - -//============================================================================= -// ILUCP -// -struct pivot_t { - ulong k; - uint p, pivot; -}; - -static void ilucp_get_data(struct array *data, ulong P, int type, - struct array *A, ulong K, struct array *B, - 
struct array *pvts, struct crystal *cr, - struct array *rqsts, struct array *fwds, - struct array *work, buffer *bfr) { - iluc_fwrd_rqsts(fwds, rqsts, type, P, A, cr, bfr); - - work->n = 0; - if (fwds->n > 0) { - sarray_sort(struct request_t, fwds->ptr, fwds->n, r, 1, bfr); - struct request_t *pf = (struct request_t *)fwds->ptr; - - uint i, j, k, l, n, o; - scalar v; - struct eij_t m = {.r = 0, .c = 0, .p = 0, .v = 0}; - -#define FILL_RQST(f, g, nd) \ - do { \ - struct mij *pa = (struct mij *)A->ptr; \ - struct mij *pb = (struct mij *)B->ptr; \ - for (i = 0, j = 0; i < fwds->n; i++) { \ - l = 0; \ - m.f = pf[i].r, m.p = pf[i].o; \ - for (; j < A->n && pa[j].f < m.f; j++) \ - ; \ - assert(j < A->n && pa[j].f == m.f); \ - for (k = j; k < A->n && pa[k].f == m.f && pa[k].g < K; k++) { \ - v = pa[k].v; \ - for (; l < B->n && pb[l].f < pa[k].g; l++) \ - ; \ - assert(l < B->n && pb[l].f == pa[k].g); \ - for (n = l; n < B->n && pb[n].f == pa[k].g && (pb[n].g < K + nd); n++) \ - ; \ - if (pvts != NULL) { \ - struct pivot_t *pp = (struct pivot_t *)pvts->ptr; \ - o = 0; \ - for (; n < B->n && pb[n].f == pa[k].g; n++) { \ - m.g = pb[n].g, m.v = -v * pb[n].v; \ - while (o < pvts->n && pp[o].k < m.g) \ - o++; \ - assert(o < pvts->n && pp[o].k == m.g); \ - if (!pp[o].pivot) \ - array_cat(struct eij_t, work, &m, 1); \ - } \ - } else { \ - for (; n < B->n && pb[n].f == pa[k].g; n++) { \ - m.g = pb[n].g, m.v = -v * pb[n].v; \ - array_cat(struct eij_t, work, &m, 1); \ - } \ - } \ - } \ - } \ - } while (0) - - if (type == CSC) - FILL_RQST(r, c, 0); - else - FILL_RQST(c, r, 1); - -#undef FILL_RQST - } - - iluc_send_data(data, type, A, work, cr, bfr); -} - -static ulong ilucp_find_pvt(ulong *perm, uint k, int lvl, uint *lvl_off, - struct array *row, struct crystal *cr, - buffer *bfr) { - // First sort by the absolute value and then setup a gs handle to iteratively - // select a pivot - ulong p = 0; - if (k < lvl_off[lvl]) { - scalar v = 0; - struct mij *pr = (struct mij *)row->ptr; - 
for (uint i = 0; i < row->n && pr[i].c < lvl_off[lvl]; i++) { - if (fabs(pr[i].v) > v) { - v = fabs(pr[i].v); - p = pr[i].c; - } - } - perm[k] = p; - } - return p; -} - -static void ilucp_update_pvts(struct array *pvts, struct array *rij, - ulong *perm, uint k, int lvl, uint *lvl_off, - struct crystal *cr, buffer *bfr) { - struct comm *c = &cr->comm; - - struct pivot_t t = {.k = 0, .pivot = 0}; - struct mij *pr = (struct mij *)rij->ptr; - for (uint i = 0; i < rij->n; i++) { - t.k = pr[i].c, t.p = t.k % c->np; - array_cat(struct pivot_t, pvts, &t, 1); - } - - uint e = (k < lvl_off[lvl] ? k : lvl_off[lvl]); - t.pivot = 1; - for (uint i = 0; i < e; i++) { - t.k = perm[i], t.p = t.k % c->np; - array_cat(struct pivot_t, pvts, &t, 1); - } - - if (pvts->n > 0) { - struct array temp; - array_init(struct pivot_t, &temp, pvts->n + 1); - - sarray_sort_2(struct pivot_t, pvts->ptr, pvts->n, k, 1, pivot, 1, bfr); - struct pivot_t *pp = (struct pivot_t *)pvts->ptr; - uint i = 1, j = 0; - for (; i < pvts->n; i++) { - if (pp[i].k != pp[j].k) { - array_cat(struct pivot_t, &temp, &pp[i - 1], 1); - j = i; - } - } - if (j < i) - array_cat(struct pivot_t, &temp, &pp[i - 1], 1); - pvts->n = 0; - array_cat(struct pivot_t, pvts, temp.ptr, temp.n); - array_free(&temp); - } - - sarray_transfer(struct pivot_t, pvts, p, 1, cr); - sarray_sort_2(struct pivot_t, pvts->ptr, pvts->n, k, 1, pivot, 0, bfr); - - if (pvts->n > 0) { - struct pivot_t *pp = (struct pivot_t *)pvts->ptr; - uint i = 1, j = 0; - for (; i < pvts->n; i++) { - if (pp[i].k != pp[j].k) { - for (; j < i - 1; j++) - pp[j].pivot = pp[i - 1].pivot; - j = i; - } - } - if (j < i) { - for (; j < i - 1; j++) - pp[j].pivot = pp[i - 1].pivot; - } - } - - sarray_transfer(struct pivot_t, pvts, p, 1, cr); - sarray_sort(struct pivot_t, pvts->ptr, pvts->n, k, 1, bfr); -} - -static void ilucp_level(struct array *lij, struct array *uij, int lvl, - struct ilu *ilu, struct array *pvts, struct array *data, - struct array *work, buffer *bfr) { - // 
Work arrays - struct array rij, cij; - array_init(struct mij, &rij, 30); - array_init(struct mij, &cij, 30); - - struct array rqst, fwds; - array_init(struct request_t, &rqst, 30); - array_init(struct request_t, &fwds, 30); - - struct par_mat *L = &ilu->L, *U = &ilu->U; - struct crystal *cr = &ilu->cr; - - // Figure out start and end of the level and agree on a range - uint *lvl_off = ilu->lvl_off, s = lvl_off[lvl - 1]; - sint buf[2], size = lvl_off[lvl] - s; - comm_allreduce(&cr->comm, gs_int, gs_max, &size, 1, buf); - uint e = s + size; - - uint i, j, je, k, l; - for (k = s; k < e; k++) { - ulong K = (k < lvl_off[lvl]) ? U->rows[k] : 0; - - // Fetch required data. We will skip the data in the columns which were - // choosen as pivots. - ilucp_get_data(data, K, CSC, lij, K, uij, pvts, cr, &rqst, &fwds, work, - bfr); - - // Init z[1:K] = 0, z[K:n] = a_{K, K:n}, i.e., z = u_{K,:} and skip the - // columns which have been choosen as pivots. - rij.n = 0; - if (K) { - struct mij m = {.r = K, .c = 0, .idx = 0, .p = 0, .v = 0}; - struct pivot_t *pp = (struct pivot_t *)pvts->ptr; - for (j = U->adj_off[k], je = U->adj_off[k + 1], l = 0; j < je; j++) { - m.c = U->cols[U->adj_idx[j]], m.v = U->adj_val[j]; - while (l < pvts->n && pp[l].k < m.c) - l++; - assert(pp[l].k == m.c); - if (!pp[l].pivot) - array_cat(struct mij, &rij, &m, 1); - } - } - - // Update z if l_KI != 0 for all I, 1 <= I < K - iluc_update(&rij, K, data, 1, bfr); - - // Select the pivot now -- all the active processors have to agree on their - // own pivot. If two processors share the same pivot, smallest one wins and - // others have to concede and find another one. So we will send a pivot - // candidate list and make each processor pick one. Right now the candidate - // list = updated row. - ulong P = ilucp_find_pvt(ilu->perm, k, lvl, lvl_off, &rij, cr, bfr); - - // Sync the pivots: Basically everyone gets updated about which cols of U - // have become pivots. 
Can't be done through a gs call, will have to send - // all the cols in U and the current row a_k along with the info if its - // a pivot. - ilucp_update_pvts(pvts, &rij, ilu->perm, k, lvl, lvl_off, cr, bfr); - - // Fetch required data for col updated. Can't combine with above call when - // we pivot? Will need to reimplement this part - ilucp_get_data(data, P, CSR, uij, K, lij, NULL, cr, &rqst, &fwds, work, - bfr); - - // Init w[1:K] = 0, w[K] = 1, w[K+1:n] = a_{K+1:n, K}, i.e., w = l_{:, K} - cij.n = 0; - if (K) { - struct mij m = {.r = 0, .c = K, .idx = 0, .p = 0, .v = 0}; - for (j = L->adj_off[k] + 1, je = L->adj_off[k + 1]; j < je; j++) { - m.r = L->rows[L->adj_idx[j]], m.v = L->adj_val[j]; - array_cat(struct mij, &cij, &m, 1); - } - } - // Update w if u_IK != 0 for all I, 1 <= I < K - iluc_update(&cij, K, data, 0, bfr); - - // Set u_{k, :} = z and find u_kk - // FIXME: This should u_{perm[k],perm[k]}}, not u_kk - scalar u_kk = 1; - struct mij *pt = (struct mij *)rij.ptr; - if (K) { - if (rij.n > 0 && fabs(pt[0].v) > 1e-12) - u_kk = pt[0].v; - array_cat(struct mij, uij, rij.ptr, rij.n); - } - - // Set l_{:, K} = w/u_KK and l_KK = 1 - pt = (struct mij *)cij.ptr; - for (j = 0; j < cij.n; j++) - pt[j].v /= u_kk; - - if (K) { - struct mij m = {.r = K, .c = K, .idx = 0, .p = 0, .v = 1}; - array_cat(struct mij, &cij, &m, 1); - array_cat(struct mij, lij, cij.ptr, cij.n); - } - } - - array_free(&rij), array_free(&cij), array_free(&rqst), array_free(&fwds); -} - -static void iluc(struct ilu *ilu, buffer *bfr) { - struct crystal *cr = &ilu->cr; - struct comm *c = &cr->comm; - - // Setup L and U - iluc_sep_lu(ilu, bfr); - - struct par_mat *A = &ilu->A, *L = &ilu->L, *U = &ilu->U; - - struct array uij, lij, data, work; - array_init(struct mij, &uij, A->rn * 30 + 1); - array_init(struct mij, &lij, A->rn * 30 + 1); - array_init(struct eij_t, &data, A->rn * 30 + 1); - array_init(struct eij_t, &work, A->rn * 30 + 1); - - struct array pvts; - array_init(struct pivot_t, &pvts, 
L->cn + 1); - - if (ilu->pivot) { - ilu->perm = tcalloc(ulong, A->rn); - // Initialize with the columns of U, i.e, columns of L - struct pivot_t t = {.k = 0, .p = 0, .pivot = 0}; - for (uint i = 0; i < U->cn; i++) { - t.k = U->cols[i], t.p = t.k % c->np; - array_cat(struct pivot_t, &pvts, &t, 1); - } - - for (int l = 1; l <= ilu->nlvls; l++) - ilucp_level(&lij, &uij, l, ilu, &pvts, &data, &work, bfr); - } else { - for (int l = 1; l <= ilu->nlvls; l++) - iluc_level(&lij, &uij, l, ilu, &data, &work, bfr); - } - - par_mat_free(L), par_mat_free(U); - par_mat_setup(U, &uij, CSR, 0, bfr); - par_mat_setup(L, &lij, CSC, 0, bfr); - - const char *val = getenv("PARRSB_DUMP_ILU"); - if (val != NULL && atoi(val) != 0) { - par_mat_dump("LL.txt", L, cr, bfr); - par_mat_dump("UU.txt", U, cr, bfr); - } - - array_free(&pvts); - array_free(&lij), array_free(&uij); - array_free(&work), array_free(&data); -} - -//============================================================================= -// ILU API related functions -// -// `vtx` array is in the order of sorted element ids -static int ilu_setup_aux(struct ilu *ilu, int nlvls, uint *lvl_off, - uint *lvl_owner, ulong *lvl_ids, const uint n, - const int nv, const slong *vtx, const int verbose, - buffer *bfr) { - struct elm { - slong vtx[8]; - uint p, lvl; - ulong e; - }; - - struct crystal *cr = &ilu->cr; - struct comm *c = &cr->comm; - - // Send the elements in each level to the owner - struct array elms; - array_init(struct elm, &elms, n); - - struct elm elm; - for (int l = 0; l < nlvls; l++) { - for (uint i = lvl_off[l]; i < lvl_off[l + 1]; i++) { - elm.lvl = l + 1, elm.e = lvl_ids[i], elm.p = lvl_owner[i]; - array_cat(struct elm, &elms, &elm, 1); - } - } - sarray_sort(struct elm, elms.ptr, elms.n, e, 1, bfr); - - struct elm *pe = (struct elm *)elms.ptr; - if (elms.n > 0) { - // Sanity check - assert(elms.n == n); - for (uint i = 0; i < n; i++) { - for (int v = 0; v < nv; v++) - pe[i].vtx[v] = vtx[i * nv + v]; - } - } - - 
sarray_transfer(struct elm, &elms, p, 1, cr); - sarray_sort_2(struct elm, elms.ptr, elms.n, lvl, 0, e, 1, bfr); - - // Setup the ILU structure: allocate ILU data structures. - ilu->nlvls = nlvls; - ilu->lvl_off = (uint *)tcalloc(uint, ilu->nlvls + 1); - - uint s = 0, e = 0; - ilu->lvl_off[0] = s; - pe = (struct elm *)elms.ptr; - for (int l = 1; l <= ilu->nlvls; l++) { - while (e < elms.n && pe[e].lvl == l) - e++; - ilu->lvl_off[l] = ilu->lvl_off[l - 1] + e - s; - s = e; - } - - // Number rows now: All the elements in Level 0 are numbered before Level - // 1 and so on. - ulong *ids = trealloc(ulong, ids, elms.n); - ulong ng = 0; - for (int l = 0; l < ilu->nlvls; l++) { - e = ilu->lvl_off[l + 1], s = ilu->lvl_off[l]; - slong out[2][1], buf[2][1], in = e - s; - comm_scan(out, c, gs_long, gs_add, &in, 1, buf); - ulong start = ng + out[0][0] + 1; - for (; s < e; s++) - ids[s] = start++; - ng += out[1][0]; - } - - slong *vrt = tcalloc(slong, elms.n * nv); - for (uint i = 0; i < elms.n; i++) { - for (int j = 0; j < nv; j++) - vrt[i * nv + j] = pe[i].vtx[j]; - } - - if (verbose > 1) { - for (uint i = 0; i < elms.n; i++) { - printf("fid = %llu, ", ids[i]); - for (int v = 0; v < nv; v++) - printf("%lld, ", vrt[i * nv + v]); - printf("\n"); - fflush(stdout); - } - } - - // Find and compress neighbors in order to form the Laplacian - struct array nbrs, eij; - find_nbrs(&nbrs, ids, vrt, elms.n, nv, cr, bfr); - compress_nbrs(&eij, &nbrs, bfr); - free(ids), free(vrt); - array_free(&elms), array_free(&nbrs); - - // Setup the parallel CSR matrix - par_csr_setup(&ilu->A, &eij, 0, bfr); - array_free(&eij); - - return 0; -} - -struct ilu *ilu_setup(const uint n, const int nv, const long long *llvtx, - const ilu_options *options, MPI_Comm comm) { - struct comm c; - comm_init(&c, comm); - - struct ilu *ilu = tcalloc(struct ilu, 1); - ilu->pivot = options->pivot, ilu->verbose = options->verbose; - ilu->tol = options->tol, ilu->nnz_per_row = options->nnz_per_row; - ilu->lvl_off = NULL, 
ilu->perm = NULL; - crystal_init(&ilu->cr, &c); - - slong *vtx = tcalloc(slong, n * nv); - for (uint i = 0; i < n * nv; i++) - vtx[i] = llvtx[i]; - - // Establish a numbering based on input - slong out[2][1], buf[2][1], in = n; - comm_scan(out, &c, gs_long, gs_add, &in, 1, buf); - ulong s = out[0][0], ng = out[1][0]; - - ulong *ids = tcalloc(ulong, n); - for (uint i = 0; i < n; i++) - ids[i] = s + i + 1; - - buffer bfr; - buffer_init(&bfr, 1024); - - uint *lvl_off = tcalloc(uint, 100 + n), *lvl_owner = lvl_off + 100; - ulong *lvl_ids = tcalloc(ulong, n); - int nlvls = find_lvls(lvl_off, lvl_owner, lvl_ids, n, nv, ids, vtx, 1, - &ilu->cr, ilu->verbose, &bfr); - ilu_setup_aux(ilu, nlvls, lvl_off, lvl_owner, lvl_ids, n, nv, vtx, - ilu->verbose, &bfr); - - char *val = getenv("PARRSB_DUMP_ILU"); - if (val != NULL && atoi(val) != 0) - par_mat_dump("A.txt", &ilu->A, &ilu->cr, &bfr); - - // Setup the ILU factors - switch (options->type) { - case 0: - ilu0(ilu, &bfr); - break; - case 1: - iluc(ilu, &bfr); - break; - default: - break; - } - - val = getenv("PARRSB_DUMP_ILU"); - if (val != NULL && atoi(val) != 0) - par_mat_dump("B.txt", &ilu->A, &ilu->cr, &bfr); - - free(ids), free(vtx), free(lvl_off), free(lvl_ids); - buffer_free(&bfr), comm_free(&c); - - return ilu; -} - -void ilu_free(struct ilu *ilu) { - if (ilu) { - crystal_free(&ilu->cr); - if (ilu->nlvls > 0) { - par_mat_free(&ilu->A); - // FIXME: Cleanup L and U - // par_mat_free(&ilu->L); - // par_mat_free(&ilu->U); - } - if (ilu->lvl_off) - free(ilu->lvl_off), ilu->lvl_off = NULL; - if (ilu->perm) - free(ilu->perm), ilu->perm = NULL; - free(ilu); - } -} - -#undef CSC -#undef CSR diff --git a/src/ilu.h b/src/ilu.h deleted file mode 100644 index 1460240e..00000000 --- a/src/ilu.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _PARRSB_ILU_H_ -#define _PARRSB_ILU_H_ - -#include "mat.h" - -typedef struct { - // ILU type: ILU(0), ILUC, etc. - int type; - // Verbose level: 0, 1, etc. 
- int verbose; - // Use pivoting or not: 0 or 1 - int pivot; - // 1st dropping rule: An entry a_ij is dropped abs(a_ij) < tol - scalar tol; - // 2nd dropping rule: Entries are dropped so that total nnz per row/col < p - unsigned int nnz_per_row; -} ilu_options; - -struct ilu; -struct ilu *ilu_setup(const uint n, const int nv, const long long *vtx, - const ilu_options *options, MPI_Comm comm); -void ilu_free(struct ilu *ilu); - -#endif diff --git a/src/io.c b/src/io.c index 6aebe026..47dec216 100644 --- a/src/io.c +++ b/src/io.c @@ -1,5 +1,7 @@ #include "parrsb-impl.h" +#include + #define READ_T(coords, buf, T, nv) \ { memcpy((coords), buf, sizeof(T) * nv); } @@ -62,17 +64,12 @@ static void re2_header(unsigned *nelt_, unsigned *nv_, ulong *nelgt_, static void re2_coord(double **coord_, unsigned int nelt, int nv, MPI_File file, struct comm *c) { - uint rank = c->id, size = c->np; - - slong out[2][1], bfr[2][1], in = nelt; - comm_scan(out, c, gs_long, gs_add, &in, 1, bfr); - slong start = out[0][0]; - - int ndim = (nv == 4) ? 2 : 3; + unsigned ndim = (nv == 4) ? 2 : 3; size_t elem_size = nv * ndim * sizeof(double) + sizeof(double); size_t header_size = GC_RE2_HEADER_LEN + sizeof(float); // Calculate read size for element data on each MPI rank. 
+ uint rank = c->id; size_t read_size = nelt * elem_size + (rank == 0) * header_size; char *buf = (char *)calloc(read_size, sizeof(char)); MPI_Status st; @@ -220,8 +217,6 @@ static void re2_boundary(unsigned int *nbcs_, long long **bcs_, static void read_geometry(unsigned *nelt, unsigned *nv, double **coord, unsigned *nbcs, long long **bcs, char *fname, struct comm *c) { - uint rank = c->id, size = c->np; - MPI_Info info; check_mpi_call(MPI_Info_create(&info), "MPI_Info_create", c); @@ -258,12 +253,12 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_, err = MPI_File_read_all(file, buf, GC_CO2_HEADER_LEN, MPI_BYTE, &st); long long nelgt, nelgv; - int nv; + unsigned nv; char version[6]; - sscanf(buf, "%5s %12lld %12lld %d", version, &nelgt, &nelgv, &nv); + sscanf(buf, "%5s %12lld %12lld %u", version, &nelgt, &nelgv, &nv); // TODO: Assert version - int nelt = nelgt / size, nrem = nelgt - nelt * size; + uint nelt = nelgt / size, nrem = nelgt - nelt * size; nelt += (rank > (size - 1 - nrem) ? 
1 : 0); if (*nv_ != 0) { @@ -303,10 +298,6 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_, MPI_Abort(comm, 911); } - slong out[2][1], bfr[2][1], in = nelt; - comm_scan(out, c, gs_long, gs_add, &in, 1, bfr); - slong start = out[0][0]; - size_t read_size = nelt * (nv + 1) * sizeof(int); size_t header_size = GC_CO2_HEADER_LEN + sizeof(float); if (rank == 0) @@ -318,19 +309,18 @@ static int read_connectivity(unsigned int *nelt_, unsigned *nv_, char *buf0 = buf + (rank == 0) * header_size; long long *vl = *vl_ = tcalloc(long long, nv *nelt); - int j, tmp1, tmp2; + int tmp1, tmp2; for (uint i = 0; i < nelt; i++) { READ_T(&tmp1, buf0, int, 1); buf0 += sizeof(int); - for (j = 0; j < nv; j++) { + for (unsigned j = 0; j < nv; j++) { READ_T(&tmp2, buf0, int, 1); buf0 += sizeof(int); vl[i * nv + j] = tmp2; } } - if (buf) - free(buf); + free(buf); return 0; } @@ -347,7 +337,7 @@ int parrsb_read_mesh(unsigned *nel, unsigned *nv, long long **vl, // Read geometry from .re2 file if (read & 1) { - char geom_name[BUFSIZ]; + char geom_name[BUFSIZ + 1]; strncpy(geom_name, name, BUFSIZ); strncat(geom_name, ".re2", 5); read_geometry(nel, nv, coord, nbcs, bcs, geom_name, &c); @@ -355,7 +345,7 @@ int parrsb_read_mesh(unsigned *nel, unsigned *nv, long long **vl, // Read connectivity from .co2 file if the user asks us to read it. 
if (read & 2) { - char conn_name[BUFSIZ]; + char conn_name[BUFSIZ + 1]; strncpy(conn_name, name, BUFSIZ); strncat(conn_name, ".co2", 5); read_connectivity(nel, nv, vl, conn_name, &c); @@ -375,7 +365,7 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl, comm_init(&c, comm); uint id = c.id; - char co2_name[BUFSIZ]; + char co2_name[BUFSIZ + 1]; strncpy(co2_name, name, BUFSIZ); strncat(co2_name, ".co2", 5); @@ -413,12 +403,11 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl, buf0 += sizeof(float); } - int i, j, temp; - for (i = 0; i < nelt; i++) { - temp = start + i + 1; + for (unsigned i = 0; i < nelt; i++) { + int temp = start + i + 1; WRITE_INT(buf0, temp); buf0 += sizeof(int); - for (j = 0; j < nv; j++) { + for (unsigned j = 0; j < nv; j++) { temp = vl[i * nv + j]; WRITE_INT(buf0, temp); buf0 += sizeof(int); @@ -442,7 +431,7 @@ int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vtx, char version[6] = "#v001"; float test = 6.54321; - char ma2_name[BUFSIZ]; + char ma2_name[BUFSIZ + 1]; strncpy(ma2_name, name, BUFSIZ); strncat(ma2_name, ".ma2", 5); @@ -514,64 +503,11 @@ int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vtx, errs += (err != 0); MPI_Info_free(&infoIn); - if (buf) - free(buf); + free(buf); return errs; } -int parrsb_dump_part(char *name, unsigned nel, unsigned nv, double *coord, - int gid, MPI_Comm comm) { - struct comm c; - comm_init(&c, comm); - - int rank = c.id, size = c.np; - - MPI_File file; - int err = MPI_File_open(comm, name, MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &file); - parrsb_check_error(err, comm); - - slong out[2][1], buf[2][1], nelt = nel; - comm_scan(out, &c, gs_long, gs_add, &nelt, 1, buf); - slong start = out[0][0], nelgt = out[1][0]; - - int ndim = (nv == 8) ? 
3 : 2; - uint wsize = (ndim * sizeof(double) + sizeof(int)) * nelt; - if (rank == 0) - wsize += sizeof(slong) + sizeof(int); // for nelgt and ndim - - char *pbuf, *pbuf0; - pbuf = pbuf0 = (char *)tcalloc(char, wsize); - if (rank == 0) { - WRITE_T(pbuf0, &nelgt, slong, 1); - WRITE_T(pbuf0, &ndim, int, 1); - } - - uint i, j, k; - double tcoord[3]; - for (i = 0; i < nelt; i++) { - tcoord[0] = tcoord[1] = tcoord[2] = 0.0; - for (j = 0; j < nv; j++) - for (k = 0; k < ndim; k++) - tcoord[k] += coord[i * nv * ndim + j * ndim + k]; - tcoord[0] /= nv, tcoord[1] /= nv, tcoord[2] /= nv; - WRITE_T(pbuf0, tcoord, double, ndim); - WRITE_T(pbuf0, &gid, int, 1); - } - - MPI_Status st; - err = MPI_File_write_ordered(file, pbuf, wsize, MPI_BYTE, &st); - parrsb_check_error(err, comm); - - err += MPI_File_close(&file); - parrsb_check_error(err, comm); - - free(pbuf); - - return err; -} - #undef check_call #undef check_mpi_call diff --git a/src/laplacian.c b/src/laplacian.c index db0b84ba..34067751 100644 --- a/src/laplacian.c +++ b/src/laplacian.c @@ -18,8 +18,9 @@ struct csr_laplacian { }; static void find_nbrs_rsb(struct array *arr, const struct rsb_element *elems, - const uint nelt, const int nv, const struct comm *c, - struct crystal *cr, buffer *buf) { + const uint nelt, const unsigned nv, + const struct comm *c, struct crystal *cr, + buffer *buf) { slong out[2][1], bfr[2][1], in = nelt; comm_scan(out, c, gs_long, gs_add, &in, 1, bfr); ulong eid = out[0][0] + 1; @@ -157,7 +158,8 @@ struct gs_laplacian { }; static int gs_weighted_init(struct laplacian *l, struct rsb_element *elems, - uint lelt, int nv, struct comm *c, buffer *buf) { + const uint lelt, const unsigned nv, struct comm *c, + buffer *buf) { uint npts = nv * lelt; slong *vertices = tcalloc(slong, npts); @@ -188,10 +190,9 @@ static int gs_weighted_init(struct laplacian *l, struct rsb_element *elems, return 0; } -static int gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *buf) { +static int 
gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *bfr) { uint lelt = l->nel; - int nv = l->nv; - + unsigned nv = l->nv; struct gs_laplacian *gl = l->data; uint i, j; @@ -199,7 +200,7 @@ static int gs_weighted(scalar *v, struct laplacian *l, scalar *u, buffer *buf) { for (j = 0; j < nv; j++) gl->u[nv * i + j] = u[i]; - gs(gl->u, gs_double, gs_add, 0, gl->gsh, buf); + gs(gl->u, gs_double, gs_add, 0, gl->gsh, bfr); for (i = 0; i < lelt; i++) { v[i] = gl->diag[i] * u[i]; diff --git a/src/mat.c b/src/mat.c index 34c27c21..37bd401c 100644 --- a/src/mat.c +++ b/src/mat.c @@ -21,39 +21,43 @@ int compress_nbrs(struct array *eij, struct array *nbr, buffer *bfr) { return 1; sarray_sort_2(struct nbr, nbr->ptr, nbr->n, r, 1, c, 1, bfr); - struct nbr *ptr = (struct nbr *)nbr->ptr; - struct mij m; - m.idx = 0; - - sint i = 0; - while (i < nbr->n) { - m.r = ptr[i].r, m.c = ptr[i].c; - - sint j = i + 1; - while (j < nbr->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c) - j++; - - m.v = i - j; // = - (j - i) - array_cat(struct mij, eij, &m, 1); - i = j; + // Set off diagonal entries. 
+ { + const struct nbr *const ptr = (const struct nbr *const)nbr->ptr; + struct mij m = {.idx = 0}; + uint i = 0; + while (i < nbr->n) { + m.r = ptr[i].r, m.c = ptr[i].c; + + uint j = i + 1; + while (j < nbr->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c) + j++; + + m.v = j - i, m.v = -m.v; + array_cat(struct mij, eij, &m, 1); + i = j; + } } // Now make sure the row sum is zero - struct mij *pe = (struct mij *)eij->ptr; - i = 0; - while (i < eij->n) { - sint j = i, k = -1, s = 0; - while (j < eij->n && pe[j].r == pe[i].r) { - if (pe[j].r == pe[j].c) - k = j; - else - s += pe[j].v; - j++; + { + struct mij *const pe = (struct mij *const)eij->ptr; + uint i = 0; + while (i < eij->n) { + uint j = i; + sint k = -1, s = 0; + while (j < eij->n && pe[j].r == pe[i].r) { + if (pe[j].r == pe[j].c) + k = j; + else + s += pe[j].v; + j++; + } + assert(k >= 0); + pe[k].v = -s; + i = j; } - assert(k >= 0); - pe[k].v = -s; - i = j; } return 0; @@ -201,42 +205,48 @@ int mat_free(struct mat *mat) { // Find neighbors in the graph // void find_nbrs(struct array *arr, const ulong *eid, const slong *vtx, - const uint nelt, const int nv, struct crystal *cr, buffer *buf) { - struct array vertices; - array_init(struct nbr, &vertices, nelt * nv); - + const uint nelt, const unsigned nv, struct crystal *cr, + buffer *buf) { struct comm *c = &cr->comm; - struct nbr v = {.r = 0, .c = 0, .proc = 0}; - uint i, j; - for (i = 0; i < nelt; i++) { - v.r = eid[i]; - assert(v.r > 0); - for (j = 0; j < nv; j++) { - v.c = vtx[i * nv + j], v.proc = v.c % c->np; - array_cat(struct nbr, &vertices, &v, 1); + + struct array vertices; + { + array_init(struct nbr, &vertices, nelt * nv); + struct nbr v = {.r = 0, .c = 0, .proc = 0}; + uint i, j; + for (i = 0; i < nelt; i++) { + v.r = eid[i]; + assert(v.r > 0); + for (j = 0; j < nv; j++) { + v.c = vtx[i * nv + j], v.proc = v.c % c->np; + array_cat(struct nbr, &vertices, &v, 1); + } } } sarray_transfer(struct nbr, &vertices, proc, 1, cr); sarray_sort(struct nbr, 
vertices.ptr, vertices.n, c, 1, buf); - // FIXME: Assumes quads or hexes - struct nbr *pv = (struct nbr *)vertices.ptr, t = {.r = 0, .c = 0, .proc = 0}; array_init(struct nbr, arr, vertices.n * 10 + 1); - uint s = 0, e; - while (s < vertices.n) { - e = s + 1; - while (e < vertices.n && pv[s].c == pv[e].c) - e++; - for (i = s; i < e; i++) { - t = pv[i]; - for (j = s; j < e; j++) { - t.c = pv[j].r; - assert(t.r > 0 && t.c > 0); - array_cat(struct nbr, arr, &t, 1); + // FIXME: Assumes quads or hexes + { + const struct nbr *const pv = (const struct nbr *const)vertices.ptr; + struct nbr t = {.r = 0, .c = 0, .proc = 0}; + uint s = 0, e; + while (s < vertices.n) { + e = s + 1; + while (e < vertices.n && pv[s].c == pv[e].c) + e++; + for (uint i = s; i < e; i++) { + t = pv[i]; + for (uint j = s; j < e; j++) { + t.c = pv[j].r; + assert(t.r > 0 && t.c > 0); + array_cat(struct nbr, arr, &t, 1); + } } + s = e; } - s = e; } sarray_transfer(struct nbr, arr, proc, 1, cr); @@ -710,38 +720,42 @@ static int compress_mij(struct array *eij, struct array *entries, buffer *bfr) { return 1; sarray_sort_2(struct mij, entries->ptr, entries->n, r, 1, c, 1, bfr); - struct mij *ptr = (struct mij *)entries->ptr; - struct mij m; - m.idx = 0; + { + struct mij m = {.idx = 0}; - uint i = 0; - while (i < entries->n) { - m = ptr[i]; - uint j = i + 1; - while (j < entries->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c) - m.v += ptr[j].v, j++; + const struct mij *const ptr = (const struct mij *const)entries->ptr; + uint i = 0; + while (i < entries->n) { + m = ptr[i]; + uint j = i + 1; + while (j < entries->n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c) + m.v += ptr[j].v, j++; - array_cat(struct mij, eij, &m, 1); - i = j; + array_cat(struct mij, eij, &m, 1); + i = j; + } } // Now make sure the row sum is zero - struct mij *pe = (struct mij *)eij->ptr; - i = 0; - while (i < eij->n) { - sint j = i, k = -1; - scalar s = 0; - while (j < eij->n && pe[j].r == pe[i].r) { - if (pe[j].r == pe[j].c) - k = 
j; - else - s += pe[j].v; - j++; + { + struct mij *const pe = (struct mij *const)eij->ptr; + uint i = 0; + while (i < eij->n) { + uint j = i; + sint k = -1; + scalar s = 0; + while (j < eij->n && pe[j].r == pe[i].r) { + if (pe[j].r == pe[j].c) + k = j; + else + s += pe[j].v; + j++; + } + assert(k >= 0); + pe[k].v = -s; + i = j; } - assert(k >= 0); - pe[k].v = -s; - i = j; } return 0; diff --git a/src/mat.h b/src/mat.h index e1da9bd5..2d9d4ccd 100644 --- a/src/mat.h +++ b/src/mat.h @@ -43,7 +43,8 @@ int IS_DIAG(const struct par_mat *A); // Output array `arr` is an array of type `struct nbr` void find_nbrs(struct array *arr, const ulong *eid, const slong *vtx, - const uint nelt, const int nv, struct crystal *cr, buffer *buf); + const uint nelt, const unsigned nv, struct crystal *cr, + buffer *buf); // Output array `eij` is an array of type `struct mij`, input array `nbr` is // an array of type `struct nbr` int compress_nbrs(struct array *eij, struct array *nbr, buffer *bfr); diff --git a/src/metrics.c b/src/metrics.c index 26c85fcc..ac3f2df6 100644 --- a/src/metrics.c +++ b/src/metrics.c @@ -33,9 +33,9 @@ void metric_toc(struct comm *c, metric m) { } double metric_get_value(int level, metric m) { - if (level == -1) + if (level < 0) return metrics[m]; - if (level >= 0 && level < stack_size) + if ((uint)level < stack_size) return stack[level * MAXMETS + m]; return 0.0; } @@ -93,22 +93,19 @@ void metric_rsb_print(struct comm *c, int profile_level) { SUMMARY(i, RSB_LANCZOS)); printf(" RSB_LANCZOS_TQLI : %e/%e/%e\n", SUMMARY(i, RSB_LANCZOS_TQLI)); - printf(" RSB_INVERSE_SETUP : %e/%e/%e\n", - SUMMARY(i, RSB_INVERSE_SETUP)); - printf(" RSB_INVERSE : %e/%e/%e\n", - SUMMARY(i, RSB_INVERSE)); - printf(" RSB_PROJECT_AX : %e/%e/%e\n", - SUMMARY(i, RSB_PROJECT_AX)); - printf(" RSB_PROJECT_MG : %e/%e/%e\n", - SUMMARY(i, RSB_PROJECT_MG)); printf(" RSB_FIEDLER_CALC_NITER : %e/%e/%e\n", SUMMARY(i, RSB_FIEDLER_CALC_NITER)); printf(" RSB_SORT : %e/%e/%e\n", SUMMARY(i, RSB_SORT)); - 
printf(" RSB_REPAIR : %e/%e/%e\n", - SUMMARY(i, RSB_REPAIR)); + printf(" RSB_COMPONENTS : %e/%e/%e\n", + SUMMARY(i, RSB_COMPONENTS)); + printf(" RSB_COMPONENTS_NCOMP : %e/%e/%e\n", + SUMMARY(i, RSB_COMPONENTS_NCOMP)); + printf(" RSB_NEIGHBORS : %e/%e/%e\n", + SUMMARY(i, RSB_NEIGHBORS)); printf(" RSB_BALANCE : %e/%e/%e\n", SUMMARY(i, RSB_BALANCE)); } + fflush(stdout); } if (wrk) @@ -148,6 +145,7 @@ void metric_crs_print(struct comm *c, int profile_level) { printf(" SCHUR_SOLVE_CHOL2 : %e/%e/%e\n", SUMMARY(i, SCHUR_SOLVE_CHOL2)); } + fflush(stdout); } if (wrk) diff --git a/src/metrics.h b/src/metrics.h index 504aaa55..83d0bc3e 100644 --- a/src/metrics.h +++ b/src/metrics.h @@ -7,22 +7,24 @@ // Metrics // typedef enum { - RSB_COMPONENTS = 0, + RSB_BALANCE = 0, + RSB_COMPONENTS, + RSB_COMPONENTS_NCOMP, RSB_FIEDLER, RSB_FIEDLER_SETUP, RSB_FIEDLER_CALC, RSB_FIEDLER_CALC_NITER, + RSB_INVERSE_SETUP, + RSB_INVERSE, RSB_LANCZOS_SETUP, RSB_LANCZOS, RSB_LANCZOS_TQLI, - RSB_INVERSE_SETUP, + RSB_NEIGHBORS, + RSB_PRE, RSB_PROJECT_AX, RSB_PROJECT_MG, - RSB_INVERSE, - RSB_SORT, - RSB_PRE, RSB_REPAIR, - RSB_BALANCE, + RSB_SORT, SCHUR_PROJECT_NITER, SCHUR_PROJECT_OPERATOR, SCHUR_PROJECT_OPERATOR_FXI, diff --git a/src/multigrid.c b/src/multigrid.c index 3cd59e1b..b6e859a0 100644 --- a/src/multigrid.c +++ b/src/multigrid.c @@ -1,6 +1,10 @@ #include "multigrid.h" #include +#ifndef M_PI +#define M_PI 3.141592653589793 +#endif + struct mg_lvl { uint npres, nposts; scalar over; @@ -8,13 +12,10 @@ struct mg_lvl { struct gs_data *Q; // gs handle for matrix vector product struct par_mat *M; // Operator - - struct gs_data *Qs, *Qst; // gs handle for matrix vector product - struct par_mat *S, *St; // Smooth aggregation }; struct mg { - uint sagg, nlevels, *level_off; + uint nlevels, *level_off; struct mg_lvl **levels; scalar *buf; }; @@ -29,7 +30,7 @@ static scalar sigma_cheb(int k, int n, scalar lmin, scalar lmax) { return 1 / lamk; } -static void inline set_proc(struct mij *m, uint nelt, uint 
nrem, uint np) { +inline static void set_proc(struct mij *m, uint nelt, uint nrem, uint np) { assert(m->r > 0); if (nrem == 0) { @@ -43,129 +44,99 @@ static void inline set_proc(struct mij *m, uint nelt, uint nrem, uint np) { m->p = s + (m->r - (t + 1)) / (nelt + 1); } - assert(m->p >= 0 && m->p < np); + assert(m->p < np); } -extern int sparse_gemm(struct par_mat *WG, const struct par_mat *W, +static int sparse_gemm(struct par_mat *WG, const struct par_mat *W, const struct par_mat *G, int diag_wg, struct crystal *cr, - buffer *bfr); + buffer *bfr) { + // W is in CSR, G is in CSC; we multiply rows of W by shifting + // the columns of G from processor to processor. This is not scalable + // at all -- need to do a 2D partition of the matrices W and G. + assert(IS_CSR(W) && !IS_DIAG(W)); + assert(IS_CSC(G)); + + // Put G into an array to transfer from processor to processor + struct array gij, sij; + array_init(struct mij, &gij, 100); + array_init(struct mij, &sij, 100); + + struct mij m = {.r = 0, .c = 0, .idx = 0, .p = cr->comm.id, .v = 0}; + uint i, j, je; + for (i = 0; i < G->cn; i++) { + m.c = G->cols[i]; + for (j = G->adj_off[i], je = G->adj_off[i + 1]; j != je; j++) { + m.r = G->rows[G->adj_idx[j]]; + m.v = G->adj_val[j]; + array_cat(struct mij, &gij, &m, 1); + } + } + if (IS_DIAG(G)) { + for (i = 0; i < G->cn; i++) { + m.c = m.r = G->cols[i]; + m.v = G->diag_val[i]; + array_cat(struct mij, &gij, &m, 1); + } + } -static uint mg_setup_aux(struct mg *d, const int factor, const int sagg, - struct crystal *cr, struct array *mijs, buffer *bfr) { - uint lvl = d->nlevels; - struct mg_lvl *l = d->levels[lvl - 1]; + sarray_sort_2(struct mij, gij.ptr, gij.n, c, 1, r, 1, bfr); + struct mij *pg = (struct mij *)gij.ptr; + for (i = 0; i < gij.n; i++) + pg[i].idx = i; + + for (uint p = 0; p < cr->comm.np; p++) { + // Calculate dot product of each row of W with columns of G + for (i = 0; i < W->rn; i++) { + m.r = W->rows[i]; + uint s = 0, e = 0; + while (s < gij.n) { + m.c = 
pg[s].c, m.v = 0; + for (j = W->adj_off[i], je = W->adj_off[i + 1]; j < je; j++) { + ulong k = W->cols[W->adj_idx[j]]; + while (e < gij.n && pg[s].c == pg[e].c && pg[e].r < k) + e++; + if (e < gij.n && pg[s].c == pg[e].c && pg[e].r == k) + m.v += W->adj_val[j] * pg[e].v; + } + while (e < gij.n && pg[s].c == pg[e].c) + e++; + if (fabs(m.v) > 1e-12) + array_cat(struct mij, &sij, &m, 1); + s = e; + } + } - struct par_mat *Ml = l->M; - uint nnz = ((Ml->rn > 0) ? (Ml->adj_off[Ml->rn] + Ml->rn) : 0); + sint next = (cr->comm.id + 1) % cr->comm.np; + for (i = 0; i < gij.n; i++) + pg[i].p = next; + sarray_transfer(struct mij, &gij, p, 0, cr); - struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0}; - array_reserve(struct mij, mijs, nnz); + sarray_sort(struct mij, gij.ptr, gij.n, idx, 0, bfr); + pg = gij.ptr; + } - struct comm *c = &cr->comm; - const double sigma = 0.65; - struct par_mat *M; - // Replace M by the following if smooth aggregation is used: - // S = (I - sigma * D^{-1} * Ml) - // M = ST * Ml * S - if (sagg) { - // This is very hacky and not optimal at all. Should be rewritten. - // Create S is in CSR format, with separate diagonal. Then convert - // to CSC with no separate diagonal in order to do the mat-vec. - mijs->n = 0; - for (uint i = 0; i < Ml->rn; i++) { - m.c = m.r = Ml->rows[i], m.v = 1 - sigma; - array_cat(struct mij, mijs, &m, 1); - double di = 1.0 / Ml->diag_val[i]; - for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) { - m.c = Ml->cols[Ml->adj_idx[j]]; - m.v = -sigma * di * Ml->adj_val[j]; - array_cat(struct mij, mijs, &m, 1); - } - } - l->S = tcalloc(struct par_mat, 1); - par_mat_setup(l->S, mijs, 1, 1, bfr); - l->Qs = setup_Q(l->S, c, bfr); + par_csr_setup(WG, &sij, diag_wg, bfr); + array_free(&gij), array_free(&sij); - struct par_mat S; - par_csr_to_csc(&S, l->S, 0, cr, bfr); + return 0; +} - // Create N = M in CSR format, no separate diagonal. 
- mijs->n = 0; - for (uint i = 0; i < Ml->rn; i++) { - m.c = m.r = Ml->rows[i], m.v = Ml->diag_val[i]; - array_cat(struct mij, mijs, &m, 1); - for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) { - m.c = Ml->cols[Ml->adj_idx[j]]; - m.v = Ml->adj_val[j]; - array_cat(struct mij, mijs, &m, 1); - } - } - struct par_mat N; - par_mat_setup(&N, mijs, 1, 0, bfr); - - // T = N * S, CSR format, no separate diagonal. - struct par_mat T; - sparse_gemm(&T, &N, &S, 0, cr, bfr); - par_mat_free(&N), par_mat_free(&S); - - // N = T, CSC format, no separate diagonal. - par_csr_to_csc(&N, &T, 0, cr, bfr); - par_mat_free(&T); - - // Setup S^t, CSR format, no separate diagonal. - mijs->n = 0; - for (uint i = 0; i < Ml->rn; i++) { - m.c = m.r = Ml->rows[i], m.v = 1 - sigma; - array_cat(struct mij, mijs, &m, 1); - double di = 1.0 / Ml->diag_val[i]; - for (uint j = Ml->adj_off[i], je = Ml->adj_off[i + 1]; j < je; j++) { - m.r = Ml->cols[Ml->adj_idx[j]]; - m.v = -sigma * di * Ml->adj_val[j]; - array_cat(struct mij, mijs, &m, 1); - } - } - par_mat_setup(&T, mijs, 0, 0, bfr); - par_csc_to_csr(&S, &T, 0, cr, bfr); - par_mat_free(&T); - - // M = ST * N - M = tcalloc(struct par_mat, 1); - sparse_gemm(M, &S, &N, 1, cr, bfr); - par_mat_free(&S), par_mat_free(&N); - - // Normalize M by the largest value - double max = 0; - for (uint i = 0; i < M->rn; i++) { - for (uint j = M->adj_off[i], je = M->adj_off[i + 1]; j < je; j++) - if (fabs(M->adj_val[j]) > max) - max = fabs(M->adj_val[j]); - if (fabs(M->diag_val[i]) > max) - max = fabs(M->diag_val[i]); - } - double wrk[2]; - comm_allreduce(c, gs_double, gs_max, &max, 1, wrk); +static uint mg_setup_aux(struct mg *d, const int factor, struct crystal *cr, + struct array *mijs, buffer *bfr) { + uint lvl = d->nlevels; + struct mg_lvl *l = d->levels[lvl - 1]; - for (uint i = 0; i < M->rn; i++) { - for (uint j = M->adj_off[i], je = M->adj_off[i + 1]; j < je; j++) - M->adj_val[j] /= max; - M->diag_val[i] /= max; - } + struct par_mat *M = l->M; 
+ uint nnz = ((M->rn > 0) ? (M->adj_off[M->rn] + M->rn) : 0); - par_mat_setup(&T, mijs, 0, 0, bfr); - l->St = tcalloc(struct par_mat, 1); - par_csc_to_csr(l->St, &T, 1, cr, bfr); - par_mat_free(&T); - l->Qst = setup_Q(l->St, c, bfr); - } else { - l->S = l->St = NULL; - l->Qs = l->Qst = NULL; - M = Ml; - } + struct mij m = {.r = 0, .c = 0, .idx = 0, .p = 0, .v = 0}; + array_reserve(struct mij, mijs, nnz); // Now we interpolate to find the coarse operator Mc = J^T M J // Calculate coarse level parameters: ngc, npc, nelt, nrem uint size = (M->rn > 0 ? (M->rows[M->rn - 1] - M->rows[0] + 1) : 0); slong ng = size, wrk[2][1]; + struct comm *c = &cr->comm; comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk); // ng > 1 based on while condition in mg_setup(). so ngc >= 1 @@ -197,11 +168,6 @@ static uint mg_setup_aux(struct mg *d, const int factor, const int sagg, array_cat(struct mij, mijs, &m, 1); } - if (sagg) { - par_mat_free(M); - free(M); - } - sarray_transfer(struct mij, mijs, p, 0, cr); sarray_sort_2(struct mij, mijs->ptr, mijs->n, r, 1, c, 1, bfr); @@ -221,14 +187,13 @@ static uint mg_setup_aux(struct mg *d, const int factor, const int sagg, return lvl; } -struct mg *mg_setup(const struct par_mat *M, const int factor, const int sagg, +struct mg *mg_setup(const struct par_mat *M, const int factor, struct crystal *cr, buffer *bfr) { assert(IS_CSR(M)); assert(M->rn == 0 || IS_DIAG(M)); // Allocate memory for struct mg struct mg *d = (struct mg *)tcalloc(struct mg, 1); - d->sagg = sagg; // Setup Level 1, keeps a pointer to input matrix d->nlevels = 1; @@ -254,7 +219,7 @@ struct mg *mg_setup(const struct par_mat *M, const int factor, const int sagg, slong wrk[2], ng = size; comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk); while (ng > 1) { - uint l = mg_setup_aux(d, factor, sagg, cr, &mijs, bfr); + uint l = mg_setup_aux(d, factor, cr, &mijs, bfr); struct par_mat *Ml = d->levels[l]->M; if (Ml->rn > 0 && Ml->adj_off[Ml->rn] + Ml->rn > nnz) nnz = Ml->adj_off[Ml->rn] + Ml->rn; @@ 
-296,7 +261,7 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c, scalar *s = r + nnz, *Gs = s + nnz, *u = Gs + nnz, *wrk = u + nnz; uint i, j, n, off; - for (int lvl = 0; lvl < d->nlevels - 1; lvl++) { + for (uint lvl = 0; lvl < d->nlevels - 1; lvl++) { off = lvl_off[lvl]; n = lvl_off[lvl + 1] - off; @@ -331,10 +296,6 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c, r[off + j] = r[off + j] - Gs[off + j]; } - // Apply S^T - if (d->sagg) - mat_vec_csr(r + off, r + off, l->St, l->Qst, wrk, bfr); - // Interpolate to coarser level gs(r + off, gs_double, gs_add, 1, l->J, bfr); } @@ -342,7 +303,6 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c, // Coarsest level off = lvl_off[d->nlevels - 1]; n = lvl_off[d->nlevels] - off; - if (n == 1) { struct mg_lvl *l = d->levels[d->nlevels - 1]; struct par_mat *M = l->M; @@ -353,16 +313,12 @@ void mg_vcycle(scalar *u1, scalar *rhs, struct mg *d, struct comm *c, r[off] = u[off]; } - for (int lvl = d->nlevels - 2; lvl >= 0; lvl--) { + for (int lvl = (int)d->nlevels - 2; lvl >= 0; lvl--) { struct mg_lvl *l = d->levels[lvl]; off = lvl_off[lvl]; // J*e gs(r + off, gs_double, gs_add, 0, l->J, bfr); - // Apply S - if (d->sagg) - mat_vec_csr(r + off, r + off, l->S, l->Qs, wrk, bfr); - // u = u + over*S*J*e n = lvl_off[lvl + 1] - off; for (j = 0; j < n; j++) @@ -384,14 +340,6 @@ void mg_free(struct mg *d) { gs_free(l[i]->J), l[i]->J = NULL; if (l[i]->Q != NULL) gs_free(l[i]->Q), l[i]->Q = NULL; - if (l[i]->Qs != NULL) - gs_free(l[i]->Qs), l[i]->Qs = NULL; - if (l[i]->Qst != NULL) - gs_free(l[i]->Qst), l[i]->Qst = NULL; - if (l[i]->S != NULL) - par_mat_free(l[i]->S), l[i]->S = NULL; - if (l[i]->St != NULL) - par_mat_free(l[i]->St), l[i]->St = NULL; if (l[i] != NULL) free(l[i]), l[i] = NULL; } diff --git a/src/multigrid.h b/src/multigrid.h index 07f4e61e..72c9f961 100644 --- a/src/multigrid.h +++ b/src/multigrid.h @@ -4,7 +4,7 @@ #include "mat.h" struct mg; -struct mg 
*mg_setup(const struct par_mat *M, const int factor, const int sagg, +struct mg *mg_setup(const struct par_mat *M, const int factor, struct crystal *cr, buffer *bfr); void mg_vcycle(scalar *u, scalar *rhs, struct mg *d, struct comm *c, buffer *bfr); diff --git a/src/parRSB.h b/src/parRSB.h index ec3fa119..7a71ca6f 100644 --- a/src/parRSB.h +++ b/src/parRSB.h @@ -20,36 +20,44 @@ extern "C" { // typedef struct { // General options - int partitioner; // Partition algo: 0 - RSB, 1 - RCB, 2 - RIB (Default: 0) - int verbose_level; // Verbose level: 0, 1, 2, .. etc (Default: 1) - int profile_level; // Profile level: 0, 1, 2, .. etc (Default: 1) - int two_level; // Enable two level partitioning (Default: 0) + int partitioner; // Partition algo: 0 - RSB, 1 - RCB, 2 - RIB (Default: 0) + int tagged; // Tagged partitioning: 0 - No, 1 - Yes (Default: 0) + int levels; // Number of levels (levels: 1, 2) int repair; // Repair disconnected components: 0 - No, 1 - Yes (Default: 0) - // RSB common (Lanczos + MG) options + int verbose_level; // Verbose level: 0, 1, 2, .. etc (Default: 1) + int profile_level; // Profile level: 0, 1, 2, .. etc (Default: 0) + // RSB common (Lanczos and MG) options int rsb_algo; // RSB algo: 0 - Lanczos, 1 - MG (Default: 0) int rsb_pre; // RSB pre-partition : 0 - None, 1 - RCB , 2 - RIB (Default: 1) int rsb_max_iter; // Max iterations in Lanczos / MG (Default: 50) int rsb_max_passes; // Max Lanczos restarts / Inverse iterations (Default: 50) double rsb_tol; // Tolerance for Lanczos or RQI (Default: 1e-5) + int rsb_dump_stats; // Dump partition statistics to a text file. 
// RSB MG specific options int rsb_mg_grammian; // MG Grammian: 0 or 1 (Default: 0) int rsb_mg_factor; // MG Coarsening factor (Default: 2, should be > 1) - int rsb_mg_sagg; // MG smooth aggregation: 0 or 1 (Default: 0) } parrsb_options; extern parrsb_options parrsb_default_options; -int parrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord, - int nel, int nv, parrsb_options options, MPI_Comm comm); +int parrsb_part_mesh(int *part, const long long *const vtx, + const double *const xyz, const int *const tag, + const int nel, const int nv, parrsb_options *const options, + MPI_Comm comm); -#define fparrsb_part_mesh FORTRAN_UNPREFIXED(fparrsb_partmesh, FPARRSB_PARTMESH) -void fparrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord, - int *nel, int *nve, int *options, int *comm, int *err); +void parrsb_part_solid(int *part, const long long *vtx2, unsigned nel2, + const long long *vtx1, unsigned nel1, unsigned nv, + MPI_Comm comm); +void parrsb_check_tagged_partitions(const long long *const eids, + const long long *const vtx, const uint nel, + const unsigned nv, const uint ntags, + const struct comm *const c, + const int verbose); //============================================================================== // Connectivity // -int parrsb_conn_mesh(long long *vtx, double *coord, int nel, int nDim, +int parrsb_conn_mesh(long long *vtx, double *coord, uint nel, unsigned nDim, long long *periodicInfo, int nPeriodicFaces, double tol, MPI_Comm comm); @@ -72,9 +80,6 @@ int parrsb_dump_con(char *name, unsigned nelt, unsigned nv, long long *vl, int parrsb_dump_map(char *name, unsigned nelt, unsigned nv, long long *vl, MPI_Comm comm); -int parrsb_dump_part(char *name, unsigned nelt, unsigned nv, double *coord, - int gid, MPI_Comm comm); - //============================================================================== // Auxiliary functions // @@ -85,13 +90,6 @@ typedef struct { int dump; // dump the connectivity or map file, default: 1 int nactive; 
// # of active MPI ranks, default: INT_MAX int verbose; // Verbosity, default: 0 - - int ilu_type; // ILU type, default: 0 - double ilu_tol; // ILU tolerance, default: 0.1 - int ilu_pivot; // Pivoting for ILU: default: 0 - - int crs_type; // Coarse solver type, default: 0 - double crs_tol; // Coarse tolerance, default: 1e-3 } parrsb_cmd_line_opts; parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]); @@ -99,7 +97,7 @@ parrsb_cmd_line_opts *parrsb_parse_cmd_opts(int argc, char *argv[]); void parrsb_cmd_opts_free(parrsb_cmd_line_opts *opts); int parrsb_dist_mesh(unsigned *nelt, long long **vl, double **coord, int *part, - int nv, MPI_Comm comm); + unsigned nv, MPI_Comm comm); int parrsb_setup_mesh(unsigned *nelt, unsigned *nv, long long **vl, double **coord, parrsb_cmd_line_opts *opts, diff --git a/src/parrsb-impl.h b/src/parrsb-impl.h index e34319e2..f792dbb1 100644 --- a/src/parrsb-impl.h +++ b/src/parrsb-impl.h @@ -1,15 +1,12 @@ #ifndef _PARRSB_IMPL_H_ #define _PARRSB_IMPL_H_ -#include "parRSB.h" -#include +#define _POSIX_C_SOURCE 200809L + #include -#include -#include -#include -#include #include -#include + +#include "parRSB.h" #ifdef scalar #undef scalar @@ -22,15 +19,14 @@ #define SCALAR_MAX DBL_MAX #define SCALAR_TOL 1e-12 -#define MAXDIM 3 // Maximum dimension of the mesh -#define MAXNV 8 // Maximum number of vertices per element +#define MAXDIM 3 // Maximum dimension of the mesh. +#define MAXNV 8 // Maximum number of vertices per element. //------------------------------------------------------------------------------ -// RCB / RIB +// RCB / RIB. // `struct rcb_element` is used for RCB and RIB partitioning. 
-// `struct rsb_element` should be a superset of `struct rcb_element` struct rcb_element { - uint proc, origin, seq; + uint proc, origin; ulong globalId; scalar coord[MAXDIM], fiedler; }; @@ -41,17 +37,20 @@ int rib(struct array *elements, size_t unit_size, int ndim, struct comm *c, buffer *bfr); //------------------------------------------------------------------------------ -// RSB -// +// RSB. +// `struct rsb_element` = `struct rcb_element` + vertices. Order is important. struct rsb_element { - uint proc, origin, seq; + uint proc, origin; ulong globalId; scalar coord[MAXDIM], fiedler; slong vertices[MAXNV]; }; +void rsb(struct array *elements, int nv, const parrsb_options *const options, + const struct comm comms[3], buffer *bfr); + //------------------------------------------------------------------------------ -// Find number of components +// Find number of components. // uint get_components(sint *component, struct array *elems, unsigned nv, struct comm *c, buffer *buf, int verbose); @@ -59,7 +58,21 @@ uint get_components_v2(sint *component, struct array *elems, unsigned nv, const struct comm *ci, buffer *bfr, int verbose); //------------------------------------------------------------------------------ -// Laplacian +// Dump partition statistics. +// +void parrsb_dump_stats_start(const uint nv_); + +void parrsb_dump_stats(const struct comm *const gc, const struct comm *const lc, + const struct array *const elems, buffer *bfr); + +void parrsb_dump_stats_end(const struct comm *const gc, const char *prefix); + +uint parrsb_get_neighbors(const struct array *const elems, const unsigned nv, + const struct comm *const gc, + const struct comm *const lc, buffer *bfr); + +//------------------------------------------------------------------------------ +// Laplacian. 
// #define GS 1 #define CSR 2 @@ -72,11 +85,11 @@ int laplacian(scalar *v, struct laplacian *l, scalar *u, buffer *buf); void laplacian_free(struct laplacian *l); //------------------------------------------------------------------------------ -// Misc +// Misc. // int log2ll(long long n); void parrsb_barrier(struct comm *c); -void debug_print(struct comm *c, int verbose, const char *fmt, ...); +void parrsb_print(const struct comm *c, int verbose, const char *fmt, ...); #endif diff --git a/src/parrsb.c b/src/parrsb.c new file mode 100644 index 00000000..d68dec00 --- /dev/null +++ b/src/parrsb.c @@ -0,0 +1,972 @@ +#include "metrics.h" +#include "parrsb-impl.h" + +#include +#include +#include +#include +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +void parrsb_print(const struct comm *c, int verbose, const char *fmt, ...) { + comm_barrier(c); + + va_list vargs; + if (c->id == 0 && verbose > 0) { + va_start(vargs, fmt); + vprintf(fmt, vargs); + va_end(vargs); + printf("\n"); + fflush(stdout); + } +} + +parrsb_options parrsb_default_options = { + // General options + .partitioner = 0, + .tagged = 0, + .levels = 2, + .repair = 0, + .verbose_level = 1, + .profile_level = 0, + // RSB common (Lanczos and MG) options + .rsb_algo = 0, + .rsb_pre = 1, + .rsb_max_iter = 50, + .rsb_max_passes = 50, + .rsb_tol = 1e-5, + .rsb_dump_stats = 0, + // RSB MG specific options + .rsb_mg_grammian = 0, + .rsb_mg_factor = 2}; + +static char *ALGO[3] = {"RSB", "RCB", "RIB"}; + +static void update_options(parrsb_options *const options) { +#define UPDATE_OPTION(OPT, STR, IS_INT) \ + do { \ + const char *val = getenv(STR); \ + if (val != NULL) { \ + if (IS_INT) \ + options->OPT = atoi(val); \ + else \ + options->OPT = atof(val); \ + } \ + } while (0) + + UPDATE_OPTION(partitioner, "PARRSB_PARTITIONER", 1); + UPDATE_OPTION(tagged, "PARRSB_TAGGED", 1); + UPDATE_OPTION(levels, "PARRSB_LEVELS", 1); + UPDATE_OPTION(repair, "PARRSB_REPAIR", 1); + UPDATE_OPTION(verbose_level, 
"PARRSB_VERBOSE_LEVEL", 1); + UPDATE_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", 1); + UPDATE_OPTION(rsb_algo, "PARRSB_RSB_ALGO", 1); + UPDATE_OPTION(rsb_pre, "PARRSB_RSB_PRE", 1); + UPDATE_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", 1); + UPDATE_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", 1); + UPDATE_OPTION(rsb_tol, "PARRSB_RSB_TOL", 0); + UPDATE_OPTION(rsb_dump_stats, "PARRSB_DUMP_STATS", 1); + UPDATE_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", 1); + UPDATE_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", 1); + +#undef UPDATE_OPTION +} + +static void print_options(const struct comm *c, + const parrsb_options *const options) { +#define PRINT_OPTION(OPT, STR, FMT) \ + parrsb_print(c, options->verbose_level, "%s = " FMT "", STR, options->OPT) + + PRINT_OPTION(partitioner, "PARRSB_PARTITIONER", "%d"); + PRINT_OPTION(tagged, "PARRSB_TAGGED", "%d"); + PRINT_OPTION(levels, "PARRSB_LEVELS", "%d"); + PRINT_OPTION(repair, "PARRSB_REPAIR", "%d"); + PRINT_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", "%d"); + PRINT_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", "%d"); + PRINT_OPTION(rsb_algo, "PARRSB_RSB_ALGO", "%d"); + PRINT_OPTION(rsb_pre, "PARRSB_RSB_PRE", "%d"); + PRINT_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", "%d"); + PRINT_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", "%d"); + PRINT_OPTION(rsb_tol, "PARRSB_RSB_TOL", "%lf"); + PRINT_OPTION(rsb_dump_stats, "PARRSB_DUMP_STATS", "%d"); + PRINT_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", "%d"); + PRINT_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", "%d"); + +#undef PRINT_OPTION +} + +static size_t load_balance(struct array *elist, uint nel, int nv, + const double *const xyz, const long long *const vtx, + int verbose, struct crystal *cr, buffer *bfr) { + struct comm *c = &cr->comm; + slong out[2][1], wrk[2][1], in = nel; + comm_scan(out, c, gs_long, gs_add, &in, 1, wrk); + slong start = out[0][0], nelg = out[1][0]; + parrsb_print(c, verbose, "load_balance: start = %lld nelg = %lld", start, + nelg); + + 
uint nstar = nelg / c->np, nrem = nelg - nstar * c->np; + slong lower = (nstar + 1) * nrem; + + size_t unit_size; + if (vtx == NULL) // RCB + unit_size = sizeof(struct rcb_element); + else // RSB + unit_size = sizeof(struct rsb_element); + parrsb_print( + c, verbose, "load_balance: unit_size = %zu (rsb = %zu, rcb = %zu)", + unit_size, sizeof(struct rsb_element), sizeof(struct rcb_element)); + + array_init_(elist, nel, unit_size, __FILE__, __LINE__); + + struct rcb_element *pe = (struct rcb_element *)calloc(1, unit_size); + pe->origin = c->id; + + int ndim = (nv == 8) ? 3 : 2; + for (uint e = 0; e < nel; ++e) { + slong eg = pe->globalId = start + e + 1; + if (nstar == 0) + pe->proc = eg - 1; + else if (eg <= lower) + pe->proc = (eg - 1) / (nstar + 1); + else + pe->proc = (eg - 1 - lower) / nstar + nrem; + + pe->coord[0] = pe->coord[1] = pe->coord[2] = 0.0; + if (xyz != NULL) { + for (int v = 0; v < nv; v++) + for (int n = 0; n < ndim; n++) + pe->coord[n] += xyz[e * ndim * nv + v * ndim + n]; + for (int n = 0; n < ndim; n++) + pe->coord[n] /= nv; + } + + array_cat_(unit_size, elist, pe, 1, __FILE__, __LINE__); + } + + if (vtx != NULL) { // RSB + struct rsb_element *pr = (struct rsb_element *)elist->ptr; + for (uint e = 0; e < nel; e++) { + for (int v = 0; v < nv; v++) + pr[e].vertices[v] = vtx[e * nv + v]; + } + } + + sarray_transfer_(elist, unit_size, offsetof(struct rcb_element, proc), 1, cr); + if (vtx == NULL) // RCB + sarray_sort(struct rcb_element, elist->ptr, elist->n, globalId, 1, bfr); + else // RSB + sarray_sort(struct rsb_element, elist->ptr, elist->n, globalId, 1, bfr); + + free(pe); + + return unit_size; +} + +static void restore_original(int *part, struct crystal *cr, struct array *elist, + size_t usize, buffer *bfr) { + sarray_transfer_(elist, usize, offsetof(struct rcb_element, origin), 1, cr); + uint nel = elist->n; + + if (usize == sizeof(struct rsb_element)) // RSB + sarray_sort(struct rsb_element, elist->ptr, nel, globalId, 1, bfr); + else if 
(usize == sizeof(struct rcb_element)) // RCB + sarray_sort(struct rcb_element, elist->ptr, nel, globalId, 1, bfr); + + struct rcb_element *element; + uint e; + for (e = 0; e < nel; e++) { + element = (struct rcb_element *)((char *)elist->ptr + e * usize); + part[e] = element->origin; // element[e].origin; + } +} + +static void initialize_node_aux(struct comm *c, const struct comm *const gc) { +#ifdef MPI + MPI_Comm node; + MPI_Comm_split_type(gc->c, MPI_COMM_TYPE_SHARED, gc->id, MPI_INFO_NULL, + &node); + comm_init(c, node); + MPI_Comm_free(&node); +#else + comm_init(1, 1); +#endif +} + +static void initialize_levels(struct comm *const comms, int *const levels_, + const struct comm *const c, const int verbose) { + // Level 1 communicator is the global communicator. + comm_dup(&comms[0], c); + // Node level communicator is the last level communicator. + struct comm nc; + initialize_node_aux(&nc, c); + + // Find the number of nodes under the global communicator and number of MPI + // ranks in the node level communicator. + uint nnodes, nranks_per_node; + { + sint in = (nc.id == 0), wrk; + comm_allreduce(c, gs_int, gs_add, &in, 1, &wrk); + nnodes = in; + + nranks_per_node = nc.np; + // Check invariant: nranks_per_node should be the same across all the nodes. + sint nranks_max = nranks_per_node, nranks_min = nranks_per_node; + comm_allreduce(&comms[0], gs_int, gs_max, &nranks_max, 1, &wrk); + comm_allreduce(&comms[0], gs_int, gs_min, &nranks_min, 1, &wrk); + assert(nranks_max == nranks_min); + // Check invariant: nranks_per_node must be larger than 0. + assert(nranks_per_node > 0); + parrsb_print(c, verbose, + "initialize_levels: num_nodes = %u, num_ranks_per_node = %u", + nnodes, nranks_per_node); + } + + // Check if there are custom levels specified by the user. Size of the + // partition (in terms of number of nodes) in a given level must be a + // multiple of the partition size of the next level. 
+ sint levels; + uint sizes[2] = {nnodes, 1}; + { + const uint size_max = sizeof(sizes) / sizeof(sizes[0]); + uint start = 1; + while (start < size_max && sizes[start] >= sizes[0]) + start++; + while (start < size_max && sizes[0] % sizes[start]) + ++start; + + uint level = 1; + for (; start < size_max; ++start, ++level) + sizes[level] = sizes[start]; + // Set the size of the last partition to 1 (since it is the node level + // partitioner). + sizes[level - 1] = 1; + + // Check assert: sizes should be strictly decreasing. + for (uint i = 1; i < level; i++) + assert(sizes[i - 1] > sizes[i]); + + levels = level; + } + + for (sint level = 1; level < levels - 1; ++level) { + comm_split(&comms[level - 1], + comms[level - 1].id / (sizes[level] * nranks_per_node), + comms[level - 1].id, &comms[level]); + } + levels = MIN(levels, *levels_); + if (levels > 1) + comm_dup(&comms[levels - 1], &nc); + *levels_ = levels; + parrsb_print(c, verbose, "initialize_levels: levels = %u", levels); + + comm_free(&nc); +} + +static void parrsb_part_mesh_v0(int *part, const long long *const vtx, + const double *const xyz, const uint nel, + const unsigned nv, + parrsb_options *const options, + const struct comm *const c, + struct crystal *const cr, buffer *const bfr) { + const int verbose = options->verbose_level; + + if (vtx == NULL && xyz == NULL) { + parrsb_print( + c, verbose, + "parrsb_part_mesh_v0: Both vertices and coordinates can't be NULL"); + MPI_Abort(c->c, EXIT_FAILURE); + } + if (xyz == NULL) + options->rsb_pre = 0; + + struct array elist; + size_t esize = load_balance(&elist, nel, nv, xyz, vtx, verbose, cr, bfr); + + struct comm ca; + comm_split(c, elist.n > 0, c->id, &ca); + + // Setup communicators for each level of the partitioning. + struct comm comms[9]; + { + // Check invariant: levels > 0 and levels <= sizeof(comms) / + // sizeof(comms[0]). 
+ const uint levels = options->levels; + assert(levels <= sizeof(comms) / sizeof(comms[0])); + initialize_levels(comms, &options->levels, &ca, verbose); + parrsb_print(c, verbose, + "parrsb_part_mesh_v0: Levels: requested = %d, enabled = %d", + levels, options->levels); + } + + parrsb_print(c, verbose, "parrsb_part_mesh_v0: running partitioner ..."); + if (elist.n > 0) { + int ndim = (nv == 8) ? 3 : 2; + switch (options->partitioner) { + case 0: + rsb(&elist, nv, options, comms, bfr); + break; + case 1: + rcb(&elist, esize, ndim, &ca, bfr); + break; + case 2: + rib(&elist, esize, ndim, &ca, bfr); + break; + default: + break; + } + } + comm_free(&ca); + + for (uint l = 0; l < (uint)options->levels; l++) + comm_free(&comms[l]); + + parrsb_print(c, verbose, "parrsb_part_mesh_v0: restore original input"); + restore_original(part, cr, &elist, esize, bfr); + + array_free(&elist); +} + +void parrsb_check_tagged_partitions(const long long *const eids, + const long long *const vtx, const uint nel, + const unsigned nv, const uint ntags, + const struct comm *const c, + const int verbose) { + parrsb_print(c, verbose, "Check if the input elements are sorted locally."); + { + sint sorted = 1; + for (uint i = 1; i < nel; i++) { + if (eids[i] < eids[i - 1]) { + sorted = 0; + break; + } + } + + sint wrk; + comm_allreduce(c, gs_int, gs_min, &sorted, 1, &wrk); + if (!sorted) { + if (c->id == 0) { + fprintf(stderr, "Input elements are not sorted.\n"); + fflush(stderr); + } + exit(EXIT_FAILURE); + } + } + + // Number the elements within the each tag id and setup a gs handle based on + // 2D element id. 
+ parrsb_print(c, verbose, "Number elements within each layer."); + const uint tag_id = c->id / ntags; + struct comm lc; + struct gs_data *gse = NULL; + { + comm_split(c, tag_id, c->id, &lc); + + slong out[2][1], wrk[2][1], in = nel; + comm_scan(out, &lc, gs_long, gs_add, &in, 1, wrk); + slong start = out[0][0]; + + slong *lids = tcalloc(slong, nel); + for (uint i = 0; i < nel; i++) + lids[i] = start + i; + + gse = gs_setup(lids, nel, c, 0, gs_pairwise, 0); + free(lids); + } + + // Setup a local gs handle based on the original gs vertex ids. + parrsb_print(c, verbose, "Setup multiplicity."); + const size_t size = nel * nv; + buffer bfr; + buffer_init(&bfr, size); + sint *mul = tcalloc(sint, size); + { + struct gs_data *gsl = gs_setup(vtx, size, &lc, 0, gs_pairwise, 0); + for (uint i = 0; i < size; i++) + mul[i] = 1; + gs(mul, gs_int, gs_add, 0, gsl, &bfr); + gs_free(gsl); + } + + // Now let's compare the multiplicity across the layers. + parrsb_print(c, verbose, "Check multiplicity across the layers."); + { + sint *lmin = tcalloc(sint, nel); + sint *lmax = tcalloc(sint, nel); + for (uint v = 0; v < nv; v++) { + for (uint e = 0; e < nel; e++) { + lmin[e] = mul[e * nv + v]; + lmax[e] = mul[e * nv + v]; + } + + gs(lmin, gs_int, gs_min, 0, gse, &bfr); + gs(lmax, gs_int, gs_max, 0, gse, &bfr); + + for (uint e = 0; e < nel; e++) + assert(lmin[e] == lmax[e]); + } + + free(lmin), free(lmax); + } + + free(mul); + buffer_free(&bfr); + gs_free(gse); + comm_free(&lc); + + return; +} + +static void parrsb_part_mesh_v1(int *part, const long long *const vtx, + const double *const xyz, const int *const tag, + const uint nel, const unsigned nv, + parrsb_options *const options, + const struct comm *const c, + struct crystal *const cr, buffer *const bfr) { + const int verbose = options->verbose_level; + parrsb_print(c, verbose, "Find number of tags in the mesh ..."); + + struct tag_t { + uint p, tag, seq, tagn; + }; + + struct array tags; + array_init(struct tag_t, &tags, nel); + + { 
+ struct tag_t tt; + for (uint i = 0; i < nel; i++) { + tt.seq = i, tt.tag = tag[i], tt.p = tt.tag % c->np; + array_cat(struct tag_t, &tags, &tt, 1); + } + sarray_sort(struct tag_t, tags.ptr, tags.n, tag, 0, bfr); + } + + struct array unique; + array_init(struct tag_t, &unique, 1024); + + if (tags.n > 0) { + const struct tag_t *const pt = (const struct tag_t *const)tags.ptr; + array_cat(struct tag_t, &unique, &pt[0], 1); + for (uint i = 1; i < tags.n; i++) { + if (pt[i].tag > pt[i - 1].tag) + array_cat(struct tag_t, &unique, &pt[i], 1); + } + } + + sint out[2][1]; + { + sarray_transfer(struct tag_t, &unique, p, 1, cr); + sarray_sort(struct tag_t, unique.ptr, unique.n, tag, 0, bfr); + + const struct tag_t *const pu = (const struct tag_t *const)unique.ptr; + sint in = 0; + if (unique.n > 0) { + in = 1; + for (uint i = 1; i < unique.n; i++) { + if (pu[i].tag > pu[i - 1].tag) + in++; + } + } + + sint wrk[2][1]; + comm_scan(out, c, gs_int, gs_add, &in, 1, wrk); + } + const uint num_tags = out[1][0], tag_start = out[0][0]; + + parrsb_print(c, verbose, "Num tags: %d", num_tags); + if (c->np % num_tags != 0) { + if (c->id == 0) { + fprintf(stderr, + "Number of processes must be a multiple of number of tags: " + "processes = %d, tags = %d.\n", + c->np, num_tags); + } + exit(EXIT_FAILURE); + } + + { + struct tag_t *const pu = (struct tag_t *const)unique.ptr; + uint start = tag_start; + if (unique.n > 0) { + pu[0].tagn = start; + for (uint i = 1; i < unique.n; i++) { + if (pu[i].tag > pu[i - 1].tag) + start++; + pu[i].tagn = start; + } + } + + sarray_transfer(struct tag_t, &unique, p, 0, cr); + sarray_sort(struct tag_t, unique.ptr, unique.n, tag, 0, bfr); + } + + const uint chunk_size = c->np / num_tags; + parrsb_print(c, verbose, "Processes per tag: %d", chunk_size); + { + struct tag_t *const pt = (struct tag_t *const)tags.ptr; + const struct tag_t *const pu = (const struct tag_t *const)unique.ptr; + for (uint i = 0, s = 0; i < unique.n; i++) { + uint e = s + 1; + 
assert(pt[s].tag == pu[i].tag); + while (e < tags.n && pt[e].tag == pu[i].tag) + e++; + for (uint j = s; j < e; j++) + pt[j].p = chunk_size * pu[i].tagn + pt[i].seq % chunk_size; + s = e; + } + + sarray_sort(struct tag_t, tags.ptr, tags.n, seq, 0, bfr); + } + array_free(&unique); + + struct element_t { + uint proc, part, seq; + scalar xyz[MAXDIM * MAXNV]; + slong vertices[MAXNV]; + }; + + struct array elements; + array_init(struct element_t, &elements, nel); + + parrsb_print(c, verbose, + "Pack element data for transfering. tags.n=%u, nel=%u", tags.n, + nel); + const unsigned ndim = (nv == 8) ? 3 : 2; + { + assert(tags.n == nel); + const struct tag_t *const pt = (const struct tag_t *const)tags.ptr; + struct element_t et; + for (uint i = 0; i < tags.n; i++) { + et.proc = pt[i].p, et.seq = i; + for (uint j = 0; j < nv; j++) { + et.vertices[j] = vtx[i * nv + j]; + for (uint k = 0; k < ndim; k++) + et.xyz[j * ndim + k] = xyz[i * nv * ndim + j * ndim + k]; + } + array_cat(struct element_t, &elements, &et, 1); + } + + sarray_transfer(struct element_t, &elements, proc, 1, cr); + } + array_free(&tags); + + parrsb_print(c, verbose, "Copy element data for feeding to parRSB."); + long long *lvtx = tcalloc(long long, (elements.n + 1) * nv); + double *lxyz = tcalloc(double, (elements.n + 1) * nv * ndim); + { + const struct element_t *const pe = + (const struct element_t *const)elements.ptr; + for (uint e = 0; e < elements.n; e++) { + for (uint j = 0; j < nv; j++) { + lvtx[e * nv + j] = pe[e].vertices[j]; + for (uint k = 0; k < ndim; k++) + lxyz[e * nv * ndim + j * ndim + k] = pe[e].xyz[j * ndim + k]; + } + } + } + + parrsb_print(c, verbose, "Run parRSB locally within a tag now."); + { + int *lpart = tcalloc(int, elements.n + 1); + + struct comm lc; + comm_split(c, c->id / chunk_size, c->id, &lc); + + struct crystal lcr; + crystal_init(&lcr, &lc); + + options->verbose_level = 0; + options->profile_level = 0; + parrsb_part_mesh_v0(lpart, lvtx, lxyz, elements.n, nv, options, &lc, 
&lcr, + bfr); + crystal_free(&lcr), comm_free(&lc); + + struct element_t *const pe = (struct element_t *const)elements.ptr; + for (uint e = 0; e < elements.n; e++) { + pe[e].part = lpart[e] + (c->id / chunk_size) * chunk_size; + assert(pe[e].part < c->np); + } + free(lpart); + + sarray_transfer(struct element_t, &elements, proc, 0, cr); + assert(nel == elements.n); + } + free(lvtx), free(lxyz); + + { + sarray_sort(struct element_t, elements.ptr, elements.n, seq, 0, bfr); + const struct element_t *const pe = + (const struct element_t *const)elements.ptr; + for (uint i = 0; i < nel; i++) + part[i] = pe[i].part; + } + + array_free(&elements); +} + +static void update_frontier(sint *const target, sint *const hop, + sint *const frontier, const unsigned nv, + const unsigned hid, const struct comm *c, + buffer *const bfr) { + // If target is already set, we don't update either target or hop. + // We simply update frontier to previous target value and return. + if (*target >= 0) { + // Check invariant: *hop < INT_MAX + assert(*hop < INT_MAX); + for (uint i = 0; i < nv; i++) + frontier[i] = *target; + return; + } + + struct dest_t { + uint target; + }; + + struct array dests; + array_init(struct dest_t, &dests, nv); + { + struct dest_t dt; + for (uint i = 0; i < nv; i++) { + if (frontier[i] >= 0) { + dt.target = frontier[i]; + array_cat(struct dest_t, &dests, &dt, 1); + } + } + } + + if (dests.n > 0) { + sarray_sort(struct dest_t, dests.ptr, dests.n, target, 0, bfr); + + const struct dest_t *const pd = (const struct dest_t *const)dests.ptr; + uint current_target = pd[0].target, current_count = 1; + uint final_target = current_target, final_count = 1; + for (uint i = 1; i < dests.n; i++) { + if (pd[i].target == current_target) { + current_count++; + } else { + if (current_count > final_count) + final_count = current_count, final_target = current_target; + current_target = pd[i].target, current_count = 1; + } + } + if (current_count > final_count) + final_target = 
current_target; + + // Update frontier, target and hop. + for (uint j = 0; j < nv; j++) + frontier[j] = final_target; + *target = final_target, *hop = hid + 1; + } + + array_free(&dests); +} + +void parrsb_part_solid(int *part, const long long *const vtx2, + const unsigned nel2, const long long *const vtx1, + const unsigned nel1, const unsigned nv, + const MPI_Comm comm) { + struct comm c; + comm_init(&c, comm); + parrsb_print(&c, 1, "Running greedy solid ... nel1 = %d nel2 = %d", nel1, + nel2); + + for (uint i = 0; i < nel2; i++) + part[i] = -1; + + buffer bfr; + buffer_init(&bfr, 1024); + + struct crystal cr; + crystal_init(&cr, &c); + + // Return if global size is 0. + const uint nelt = nel1 + nel2; + slong nelg = nelt; + { + slong wrk; + comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk); + if (nelg == 0) { + parrsb_print(&c, 1, "Mesh is empty ..."); + crystal_free(&cr); + buffer_free(&bfr); + comm_free(&c); + return; + } + } + + const size_t size1 = nel1 * nv; + const size_t size2 = nel2 * nv; + const size_t size = size1 + size2; + + // Setup the gather-scatter handle to find connectivity through BFS. + parrsb_print(&c, 1, "Setup gather-scatter handle ..."); + struct gs_data *gsh = NULL; + { + slong *vtx = tcalloc(slong, size); + for (size_t i = 0; i < size1; i++) + vtx[i] = vtx1[i]; + for (size_t i = 0; i < size2; i++) + vtx[size1 + i] = vtx2[i]; + + gsh = gs_setup(vtx, size, &c, 0, gs_pairwise, 0); + free(vtx); + } + + // Check if the solid + fluid mesh is connected. Otherwise, we cannot use + // the greedy solid partitioner. 
+ parrsb_print(&c, 1, "Check if fluid + solid is connected ..."); + { + slong wrk; + sint idmin = (c.id + 1) * (size > 0); + comm_allreduce(&c, gs_int, gs_min, &idmin, 1, &wrk); + assert(idmin > 0); + + sint *const component = tcalloc(sint, size); + if (c.id + 1 == (uint)idmin) { + for (uint i = 0; i < nv; i++) + component[i] = 1; + } + + slong marked0 = 0, marked1 = 1; + sint epoch = 0; + while (marked1 > marked0) { + gs(component, gs_int, gs_max, 0, gsh, &bfr); + + marked0 = marked1, marked1 = 0; + for (uint i = 0; i < nel1 + nel2; i++) { + sint v = 0; + for (uint j = 0; j < nv; j++) + v += component[i * nv + j]; + if (v > 0) { + for (uint j = 0; j < nv; j++) + component[i * nv + j] = 1; + marked1 += 1; + } + } + + comm_allreduce(&c, gs_long, gs_add, &marked1, 1, &wrk); + parrsb_print(&c, 1, "\tepoch = %d marked0 = %lld marked1 = %lld", epoch, + marked0, marked1); + epoch++; + } + free(component); + + if (marked1 != nelg) { + if (c.id == 0) { + fprintf(stderr, "Fluid + Solid mesh is not connected.\n"); + fflush(stderr); + } + exit(EXIT_FAILURE); + } + } + + // Calculate the global number of elements in solid mesh and expected number + // of elements in each partition. + parrsb_print(&c, 1, "Calculate expected number of elements ..."); + slong nelgt2 = nel2; + uint nexp2; + { + slong wrk; + comm_allreduce(&c, gs_long, gs_add, &nelgt2, 1, &wrk); + nexp2 = nelgt2 / c.np; + nexp2 += (c.id < (nelgt2 - nexp2 * c.np)); + // Check for invariant: (min(nexp2) - max(nexp2)) <= 1. + slong nexp2_min = nexp2, nexp2_max = nexp2; + comm_allreduce(&c, gs_long, gs_min, &nexp2_min, 1, &wrk); + comm_allreduce(&c, gs_long, gs_max, &nexp2_max, 1, &wrk); + assert(nexp2_max - nexp2_min <= 1); + // Check for invariant: (sum(nexp2) == nelgt2). + slong nexp2_sum = nexp2; + comm_allreduce(&c, gs_long, gs_add, &nexp2_sum, 1, &wrk); + assert(nexp2_sum == nelgt2); + } + + // Initialize array of elements to be sent to each partition. 
+ struct elem_t { + sint part; + uint target, hop, sequence; + }; + + struct array arr; + array_init(struct elem_t, &arr, nel2); + + // Allocate space for work arrays: frontier, target, and hop. + sint *const frontier = tcalloc(sint, size); + sint *const target = tcalloc(sint, nelt); + sint *const hop = tcalloc(sint, nelt); + + uint nrecv2 = 0; + slong nrem2 = nelgt2; + while (nrem2 > 0) { + parrsb_print(&c, 1, "nrem2 = %lld", nrem2); + + // Check for invariant: nrecv2 <= nexp2. + assert(nrecv2 <= nexp2); + + // If the partition does not have enough elements, we keep it under + // consideration for accepting new solid elements. If the partition + // already has enough elements, we take that partition out of + // consideration (by setting the frontier to -1). We always initialize solid + // elements as unassigned (-1) although they may be already assigned. We + // check for that later when we actually assign the elements to partitions. + { + sint id = c.id, hid = 0; + if (nrecv2 == nexp2) + id = -1, hid = INT_MAX; + + // Max id should be >= 0; + sint wrk, idmax = id; + comm_allreduce(&c, gs_int, gs_max, &idmax, 1, &wrk); + assert(idmax >= 0); + + // Initialize frontier, target, and hop. + for (uint i = 0; i < size1; i++) + frontier[i] = id; + for (uint i = size1; i < size; i++) + frontier[i] = -1; + for (uint i = 0; i < nel1; i++) + target[i] = id, hop[i] = hid; + for (uint i = nel1; i < nelt; i++) + target[i] = -1, hop[i] = INT_MAX; + } + + // Then perform a BFS till we assign all the elements in the solid mesh with + // a potential partition id. 
+ parrsb_print(&c, 1, "Assign partition id ..."); + { + sint assigned = 0; + slong wrk; + for (uint hid = 0; !assigned; hid++) { + gs(frontier, gs_int, gs_max, 0, gsh, &bfr); + + assigned = 1; + slong unassigned = 0; + for (uint i = 0; i < nelt; i++) { + update_frontier(&target[i], &hop[i], &frontier[i * nv], nv, hid, &c, + &bfr); + assigned = assigned && (target[i] >= 0); + unassigned += (target[i] < 0); + } + + comm_allreduce(&c, gs_int, gs_min, &assigned, 1, &wrk); + comm_allreduce(&c, gs_long, gs_add, &unassigned, 1, &wrk); + parrsb_print(&c, 1, "hid = %d, assigned = %d unassigned = %d", hid, + assigned, unassigned); + } + } + + // Pack unassigned solid elements and send them to the target partition. + arr.n = 0; + { + struct elem_t et = {.part = -1}; + for (uint i = 0; i < nel2; i++) { + if (part[i] >= 0) + continue; + et.sequence = i, et.target = target[nel1 + i], et.hop = hop[nel1 + i]; + array_cat(struct elem_t, &arr, &et, 1); + } + + parrsb_print(&c, 1, "Send elemenets to the target partition ..."); + sarray_transfer(struct elem_t, &arr, target, 1, &cr); + } + + // Assign elements if the partition still doesn't have enough elements. + if (nrecv2 < nexp2) { + // We sort by hop value. Elements with lower hop value are assigned first + // since they are technically closer to the partition. + sarray_sort(struct elem_t, arr.ptr, arr.n, hop, 1, &bfr); + struct elem_t *const pa = (struct elem_t *const)arr.ptr; + uint keep = MIN(nexp2 - nrecv2, arr.n); + for (uint i = 0; i < keep; i++) + pa[i].part = c.id; + nrecv2 += keep; + // Check for invariant: nrecv2 <= nexp2. + assert(nrecv2 <= nexp2); + } + + // Send everything back with updated partition id and update the part array. 
+ { + parrsb_print(&c, 1, "Send everything back ..."); + sarray_transfer(struct elem_t, &arr, target, 0, &cr); + + const struct elem_t *const pa = (const struct elem_t *const)arr.ptr; + for (uint j = 0; j < arr.n; j++) + part[pa[j].sequence] = pa[j].part; + arr.n = 0; + } + + // Update the number of elements remaining. + { + slong wrk; + nrem2 = nexp2 - nrecv2; + comm_allreduce(&c, gs_long, gs_add, &nrem2, 1, &wrk); + } + } + + gs_free(gsh); + free(frontier), free(target), free(hop); + array_free(&arr); + crystal_free(&cr); + buffer_free(&bfr); + comm_free(&c); +} + +int parrsb_part_mesh(int *part, const long long *const vtx, + const double *const xyz, const int *const tag, + const int nel, const int nv, parrsb_options *const options, + MPI_Comm comm) { + struct comm c; + comm_init(&c, comm); + + update_options(options); + + // Check verboity and print a message. + const int verbose = options->verbose_level; + { + slong nelg = nel, wrk; + comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk); + parrsb_print(&c, verbose, "Running parRSB ..., nv = %d, nelg = %lld", nv, + nelg); + } + + print_options(&c, options); + + if (options->tagged == 1 && !tag) { + parrsb_print(&c, verbose, + "Tagged partitioning requested but tag array is NULL.."); + return 1; + } + + buffer bfr; + buffer_init(&bfr, (nel + 1) * 72); + + struct crystal cr; + crystal_init(&cr, &c); + + metric_init(); + + parrsb_barrier(&c); + const double t = comm_time(); + + if (options->tagged == 1) + parrsb_part_mesh_v1(part, vtx, xyz, tag, nel, nv, options, &c, &cr, &bfr); + if (options->tagged == 0) + parrsb_part_mesh_v0(part, vtx, xyz, nel, nv, options, &c, &cr, &bfr); + + parrsb_print(&c, verbose, "par%s finished in %g seconds.", + ALGO[options->partitioner], comm_time() - t); + + metric_rsb_print(&c, options->profile_level); + metric_finalize(); + + crystal_free(&cr); + buffer_free(&bfr); + comm_free(&c); + + return 0; +} + +#undef MIN diff --git a/src/rcb.c b/src/rcb.c index 5defa8c5..a695ab5f 100644 --- 
a/src/rcb.c +++ b/src/rcb.c @@ -1,8 +1,11 @@ #include "parrsb-impl.h" #include "sort.h" +#include +#include + static void get_axis_len(double *length, size_t unit_size, char *elems, - uint nel, int ndim, struct comm *c) { + uint nel, uint ndim, struct comm *c) { double min[3] = {DBL_MAX, DBL_MAX, DBL_MAX}, max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX}; @@ -140,23 +143,17 @@ static int rcb_level(struct array *a, size_t unit_size, int ndim, int rcb(struct array *elements, size_t unit_size, int ndim, struct comm *ci, buffer *bfr) { - struct comm c, t; + struct comm c; comm_dup(&c, ci); - int size = c.np; - int rank = c.id; - + uint size = c.np, rank = c.id; while (size > 1) { rcb_level(elements, unit_size, ndim, &c, bfr); - int bin = 1; - if (rank < (size + 1) / 2) - bin = 0; - + struct comm t; + const int bin = ((rank >= (size + 1) / 2) ? 1 : 0); comm_split(&c, bin, rank, &t); - comm_free(&c); - comm_dup(&c, &t); - comm_free(&t); + comm_free(&c), comm_dup(&c, &t), comm_free(&t); size = c.np, rank = c.id; } diff --git a/src/rib.c b/src/rib.c index 1aac9031..b64d8771 100644 --- a/src/rib.c +++ b/src/rib.c @@ -96,23 +96,16 @@ int rib(struct array *elements, size_t unit_size, int ndim, struct comm *ci, struct comm c; comm_dup(&c, ci); - int size = c.np; - int rank = c.id; - + uint size = c.np, rank = c.id; while (size > 1) { rib_level(elements, unit_size, ndim, &c, bfr); - int p = (size + 1) / 2; - int bin = (rank >= p); - - MPI_Comm comm_rib; - MPI_Comm_split(c.c, bin, rank, &comm_rib); - comm_free(&c); - comm_init(&c, comm_rib); - MPI_Comm_free(&comm_rib); + struct comm t; + const int bin = ((rank >= (size + 1) / 2) ? 
1 : 0); + comm_split(&c, bin, rank, &t); + comm_free(&c), comm_dup(&c, &t), comm_free(&t); - size = c.np; - rank = c.id; + size = c.np, rank = c.id; } comm_free(&c); diff --git a/src/rsb-aux.c b/src/rsb-aux.c deleted file mode 100644 index 04e500c4..00000000 --- a/src/rsb-aux.c +++ /dev/null @@ -1,396 +0,0 @@ -#include "metrics.h" -#include "parrsb-impl.h" -#include "sort.h" - -static unsigned disconnected = 0; - -extern int fiedler(struct array *elements, int nv, parrsb_options *options, - struct comm *gsc, buffer *buf, int verbose); - -static void test_component_versions(struct array *elements, struct comm *lc, - unsigned nv, unsigned lvl, buffer *bfr) { - // Send elements to % P processor to test disconnected components - struct crystal cr; - crystal_init(&cr, lc); - - struct rsb_element *pe = (struct rsb_element *)elements->ptr; - for (unsigned e = 0; e < elements->n; e++) - pe[e].proc = pe[e].globalId % lc->np; - - sarray_transfer(struct rsb_element, elements, proc, 1, &cr); - - MPI_Comm tmp; - int color = (lc->id < lc->np / 2); - MPI_Comm_split(lc->c, color, lc->id, &tmp); - - struct comm tc0; - comm_init(&tc0, tmp); - - sint nc1 = get_components(NULL, elements, nv, &tc0, bfr, 0); - sint nc2 = get_components_v2(NULL, elements, nv, &tc0, bfr, 0); - if (nc1 != nc2) { - if (tc0.id == 0) - printf("lvl = %u SS BFS != MS BFS: %d %d\n", lvl, nc1, nc2); - fflush(stdout); - } - if (nc1 > 1) { - if (tc0.id == 0) - printf("lvl = %u: %d disconnected componets were present.\n", lvl, nc1); - fflush(stdout); - } - - comm_free(&tc0); - MPI_Comm_free(&tmp); - - sarray_transfer(struct rsb_element, elements, proc, 0, &cr); - crystal_free(&cr); -} - -static void check_rsb_partition(struct comm *gc, parrsb_options *opts) { - int max_levels = log2ll(gc->np); - int miter = opts->rsb_max_iter, mpass = opts->rsb_max_passes; - - for (int i = 0; i < max_levels; i++) { - sint converged = 1; - int val = (int)metric_get_value(i, RSB_FIEDLER_CALC_NITER); - if (opts->rsb_algo == 0) { - if 
(val == miter * mpass) - converged = 0; - } else if (opts->rsb_algo == 1) { - if (val == mpass) - converged = 0; - } - - struct comm c; - comm_split(gc, converged, gc->id, &c); - - slong bfr[4]; - if (converged == 0) { - if (opts->rsb_algo == 0) { - double init = metric_get_value(i, TOL_INIT); - comm_allreduce(&c, gs_double, gs_min, &init, 1, (void *)bfr); - - double target = metric_get_value(i, TOL_TGT); - comm_allreduce(&c, gs_double, gs_min, &target, 1, (void *)bfr); - - double final = metric_get_value(i, TOL_FNL); - comm_allreduce(&c, gs_double, gs_min, &final, 1, (void *)bfr); - if (c.id == 0) { - printf("Warning: Lanczos reached a residual of %lf (target: %lf) " - "after %d x %d iterations in Level=%d!\n", - final, target, mpass, miter, i); - fflush(stdout); - } - } else if (opts->rsb_algo == 1) { - if (c.id == 0) { - printf("Warning: Inverse iteration didn't converge after %d " - "iterations in Level = %d\n", - mpass, i); - fflush(stdout); - } - } - } - comm_free(&c); - - sint minc, maxc; - minc = maxc = (sint)metric_get_value(i, RSB_COMPONENTS); - comm_allreduce(gc, gs_int, gs_min, &minc, 1, (void *)bfr); - comm_allreduce(gc, gs_int, gs_max, &maxc, 1, (void *)bfr); - - if (maxc > 1 && gc->id == 0) { - printf("Warning: Partition created %d/%d (min/max) disconnected " - "components in Level=%d!\n", - minc, maxc, i); - fflush(stdout); - } - } -} - -static int check_bin_val(int bin, struct comm *c) { - if (bin < 0 || bin > 1) { - if (c->id == 0) { - printf("%s:%d bin value out of range: %d\n", __FILE__, __LINE__, bin); - fflush(stdout); - } - return 1; - } - return 0; -} - -int balance_partitions(struct array *elements, int nv, struct comm *lc, - struct comm *gc, int bin, buffer *bfr) { - assert(check_bin_val(bin, gc) == 0); - - struct ielem_t { - uint index, orig; - sint dest; - scalar fiedler; - }; - - // Calculate expected # of elements per processor - uint ne = elements->n; - slong nelgt = ne, nglob = ne, wrk; - comm_allreduce(lc, gs_long, gs_add, &nelgt, 
1, &wrk); - comm_allreduce(gc, gs_long, gs_add, &nglob, 1, &wrk); - - sint ne_ = nglob / gc->np, nrem = nglob - ne_ * gc->np; - slong nelgt_exp = ne_ * lc->np + nrem / 2 + (nrem % 2) * (1 - bin); - slong send_cnt = nelgt - nelgt_exp > 0 ? nelgt - nelgt_exp : 0; - - // Setup gather-scatter - uint size = ne * nv, e, v; - slong *ids = tcalloc(slong, size); - struct rsb_element *elems = (struct rsb_element *)elements->ptr; - for (e = 0; e < ne; e++) { - for (v = 0; v < nv; v++) - ids[e * nv + v] = elems[e].vertices[v]; - } - struct gs_data *gsh = gs_setup(ids, size, gc, 0, gs_pairwise, 0); - - sint *input = (sint *)ids; - if (send_cnt > 0) - for (e = 0; e < size; e++) - input[e] = 0; - else - for (e = 0; e < size; e++) - input[e] = 1; - - gs(input, gs_int, gs_add, 0, gsh, bfr); - - for (e = 0; e < ne; e++) - elems[e].proc = gc->id; - - sint sid = (send_cnt == 0) ? gc->id : INT_MAX, balanced = 0; - comm_allreduce(gc, gs_int, gs_min, &sid, 1, &wrk); - - struct crystal cr; - - if (send_cnt > 0) { - struct array ielems; - array_init(struct ielem_t, &ielems, 10); - - struct ielem_t ielem = { - .index = 0, .orig = lc->id, .dest = -1, .fiedler = 0}; - int mul = (sid == 0) ? 
1 : -1; - for (e = 0; e < ne; e++) { - for (v = 0; v < nv; v++) { - if (input[e * nv + v] > 0) { - ielem.index = e, ielem.fiedler = mul * elems[e].fiedler; - array_cat(struct ielem_t, &ielems, &ielem, 1); - break; - } - } - } - - // Sort based on fiedler value and sets `orig` field - parallel_sort(struct ielem_t, &ielems, fiedler, gs_double, 0, 1, lc, bfr); - - slong out[2][1], bfr[2][1], nielems = ielems.n; - comm_scan(out, lc, gs_long, gs_add, &nielems, 1, bfr); - slong start = out[0][0]; - - sint P = gc->np - lc->np; - sint part_size = (send_cnt + P - 1) / P; - - if (out[1][0] >= send_cnt) { - balanced = 1; - struct ielem_t *ptr = ielems.ptr; - for (e = 0; start + e < send_cnt && e < ielems.n; e++) - ptr[e].dest = sid + (start + e) / part_size; - - crystal_init(&cr, lc); - sarray_transfer(struct ielem_t, &ielems, orig, 0, &cr); - crystal_free(&cr); - - ptr = ielems.ptr; - for (e = 0; e < ielems.n; e++) - if (ptr[e].dest != -1) - elems[ptr[e].index].proc = ptr[e].dest; - } - - array_free(&ielems); - } - - comm_allreduce(gc, gs_int, gs_max, &balanced, 1, &wrk); - if (balanced == 1) { - crystal_init(&cr, gc); - sarray_transfer(struct rsb_element, elements, proc, 0, &cr); - crystal_free(&cr); - - // Do a load balanced sort in each partition - parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, lc, - bfr); - } else { - // Forget about disconnected components, just do a load balanced partition - // TODO: Need to change how parallel_sort load balance - parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, gc, - bfr); - } - - free(ids), gs_free(gsh); - return 0; -} - -int repair_partitions_v2(struct array *elems, unsigned nv, struct comm *tc, - struct comm *lc, unsigned bin, unsigned algo, - buffer *bfr) { - assert(check_bin_val(bin, lc) == 0); - - sint ibuf; - sint nc = get_components_v2(NULL, elems, nv, tc, bfr, 0); - comm_allreduce(lc, gs_int, gs_max, &nc, 1, &ibuf); - if (nc > 1) { - // If nc > 1, send elements back and do RCBx, 
RCBy and RCBz - struct crystal cr; - crystal_init(&cr, lc); - sarray_transfer(struct rsb_element, elems, proc, 0, &cr); - crystal_free(&cr); - - // Do rcb or rib - unsigned ndim = (nv == 8) ? 3 : 2; - switch (algo) { - case 0: - parallel_sort(struct rsb_element, elems, globalId, gs_long, 0, 1, lc, - bfr); - break; - case 1: - rcb(elems, sizeof(struct rsb_element), ndim, lc, bfr); - break; - case 2: - rib(elems, sizeof(struct rsb_element), ndim, lc, bfr); - break; - default: - break; - } - - // And count number of components again. If nc > 1 still, set - // isconnected = 1 - nc = get_components_v2(NULL, elems, nv, tc, bfr, 0); - comm_allreduce(lc, gs_int, gs_max, &nc, 1, &ibuf); - if (nc > 1) - disconnected = 1; - } - - return 0; -} - -static void get_part(sint *np, sint *nid, int two_lvl, struct comm *lc, - struct comm *nc) { - if (two_lvl) { - sint out[2][1], wrk[2][1], in = (nc->id == 0); - comm_scan(out, lc, gs_int, gs_add, &in, 1, &wrk); - *nid = (nc->id == 0) * out[0][0], *np = out[1][0]; - comm_allreduce(nc, gs_int, gs_max, nid, 1, wrk); - } else { - *np = lc->np, *nid = lc->id; - } -} - -int rsb(struct array *elements, int nv, int check, parrsb_options *options, - struct comm *gc, buffer *bfr) { - // `gc` is the global communicator. We make a duplicate of it in `lc` and - // keep splitting it. `nc` is the communicator for the two level partitioning. 
- struct comm lc, nc; - - // Duplicate the global communicator to `lc` - comm_dup(&lc, gc); - - // Initialize `nc` based on `lc` - if (options->two_level) { -#ifdef MPI - MPI_Comm node; - MPI_Comm_split_type(lc.c, MPI_COMM_TYPE_SHARED, lc.id, MPI_INFO_NULL, - &node); - comm_init(&nc, node); - MPI_Comm_free(&node); -#else - comm_init(&nc, 1); -#endif - } - - // Get number of partitions we are going to perform RSB on first level - sint np, nid; - get_part(&np, &nid, options->two_level, &lc, &nc); - debug_print(gc, options->two_level && options->verbose_level, - "Number of nodes = %d\n", np); - - struct comm tc; - unsigned ndim = (nv == 8) ? 3 : 2; - while (np > 1) { - // Run the pre-partitioner - debug_print(&lc, options->verbose_level > 1, "\tPre-partitioner ..."); - metric_tic(&lc, RSB_PRE); - switch (options->rsb_pre) { - case 0: // Sort by global id - parallel_sort(struct rsb_element, elements, globalId, gs_long, 0, 1, &lc, - bfr); - break; - case 1: // RCB - rcb(elements, sizeof(struct rsb_element), ndim, &lc, bfr); - break; - case 2: // RIB - rib(elements, sizeof(struct rsb_element), ndim, &lc, bfr); - break; - default: - break; - } - metric_toc(&lc, RSB_PRE); - debug_print(&lc, options->verbose_level > 1, " done.\n"); - - // Find the Fiedler vector - debug_print(&lc, options->verbose_level > 1, "\tFiedler ..."); - unsigned bin = (nid >= (np + 1) / 2); - comm_split(&lc, bin, lc.id, &tc); - - struct rsb_element *pe = (struct rsb_element *)elements->ptr; - for (unsigned i = 0; i < elements->n; i++) - pe[i].proc = lc.id; - - metric_tic(&lc, RSB_FIEDLER); - fiedler(elements, nv, options, &lc, bfr, gc->id == 0); - metric_toc(&lc, RSB_FIEDLER); - debug_print(&lc, options->verbose_level > 1, " done.\n"); - - // Sort by Fiedler vector - debug_print(&lc, options->verbose_level > 1, "\tSort ..."); - metric_tic(&lc, RSB_SORT); - parallel_sort_2(struct rsb_element, elements, fiedler, gs_double, globalId, - gs_long, 0, 1, &lc, bfr); - metric_toc(&lc, RSB_SORT); - 
debug_print(&lc, options->verbose_level > 1, " done.\n"); - - // Attempt to repair if there are disconnected components - debug_print(&lc, options->verbose_level > 1, "\tRepair ..."); - metric_tic(&lc, RSB_REPAIR); - if (options->repair) - repair_partitions_v2(elements, nv, &tc, &lc, bin, options->rsb_pre, bfr); - metric_toc(&lc, RSB_REPAIR); - debug_print(&lc, options->verbose_level > 1, " done.\n"); - - // Bisect and balance - debug_print(&lc, options->verbose_level > 1, "\tBalance ..."); - metric_tic(&lc, RSB_BALANCE); - balance_partitions(elements, nv, &tc, &lc, bin, bfr); - metric_toc(&lc, RSB_BALANCE); - debug_print(&lc, options->verbose_level > 1, " done.\n"); - - // Split the communicator and recurse on the sub-problems. - comm_free(&lc), comm_dup(&lc, &tc), comm_free(&tc); - get_part(&np, &nid, options->two_level, &lc, &nc); - debug_print(&lc, options->verbose_level > 1, "\tBisect ... done.\n"); - metric_push_level(); - } - comm_free(&lc); - - // Partition within the node - if (options->two_level) { - options->two_level = 0; - rsb(elements, nv, 0, options, &nc, bfr); - comm_free(&nc); - } - - if (check) - check_rsb_partition(gc, options); - - return 0; -} diff --git a/src/rsb.c b/src/rsb.c index d38da260..066b00fe 100644 --- a/src/rsb.c +++ b/src/rsb.c @@ -1,250 +1,400 @@ #include "metrics.h" #include "parrsb-impl.h" -#include -#include -#include -#include -#include - -extern int rsb(struct array *elements, int nv, int check, - parrsb_options *options, struct comm *gc, buffer *bfr); -extern int rcb(struct array *elements, size_t unit_size, int ndim, - struct comm *ci, buffer *bfr); - -parrsb_options parrsb_default_options = { - // General options - .partitioner = 0, - .verbose_level = 0, - .profile_level = 0, - .two_level = 1, - .repair = 0, - // RSB common (Lanczos + MG) options - .rsb_algo = 0, - .rsb_pre = 1, - .rsb_max_iter = 50, - .rsb_max_passes = 50, - .rsb_tol = 1e-5, - // RSB MG specific options - .rsb_mg_grammian = 0, - .rsb_mg_factor = 2, - 
.rsb_mg_sagg = 0}; - -static char *ALGO[3] = {"RSB", "RCB", "RIB"}; - -#define UPDATE_OPTION(OPT, STR, IS_INT) \ - do { \ - const char *val = getenv(STR); \ - if (val != NULL) { \ - if (IS_INT) \ - options->OPT = atoi(val); \ - else \ - options->OPT = atof(val); \ - } \ - } while (0) - -static void update_options(parrsb_options *options) { - UPDATE_OPTION(partitioner, "PARRSB_PARTITIONER", 1); - UPDATE_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", 1); - UPDATE_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", 1); - UPDATE_OPTION(two_level, "PARRSB_TWO_LEVEL", 1); - UPDATE_OPTION(repair, "PARRSB_REPAIR", 1); - UPDATE_OPTION(rsb_algo, "PARRSB_RSB_ALGO", 1); - UPDATE_OPTION(rsb_pre, "PARRSB_RSB_PRE", 1); - UPDATE_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", 1); - UPDATE_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", 1); - UPDATE_OPTION(rsb_tol, "PARRSB_RSB_TOL", 0); - UPDATE_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", 1); - UPDATE_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", 1); - UPDATE_OPTION(rsb_mg_sagg, "PARRSB_RSB_MG_SMOOTH_AGGREGATION", 1); - if (options->verbose_level == 0) - options->profile_level = 0; -} +#include "sort.h" -#undef UPDATE_OPTION - -#define PRINT_OPTION(OPT, STR, FMT) printf("%s = " FMT "\n", STR, options->OPT) - -static void print_options(parrsb_options *options) { - PRINT_OPTION(partitioner, "PARRSB_PARTITIONER", "%d"); - PRINT_OPTION(verbose_level, "PARRSB_VERBOSE_LEVEL", "%d"); - PRINT_OPTION(profile_level, "PARRSB_PROFILE_LEVEL", "%d"); - PRINT_OPTION(two_level, "PARRSB_TWO_LEVEL", "%d"); - PRINT_OPTION(repair, "PARRSB_REPAIR", "%d"); - PRINT_OPTION(rsb_algo, "PARRSB_RSB_ALGO", "%d"); - PRINT_OPTION(rsb_pre, "PARRSB_RSB_PRE", "%d"); - PRINT_OPTION(rsb_max_iter, "PARRSB_RSB_MAX_ITER", "%d"); - PRINT_OPTION(rsb_max_passes, "PARRSB_RSB_MAX_PASSES", "%d"); - PRINT_OPTION(rsb_tol, "PARRSB_RSB_TOL", "%lf"); - PRINT_OPTION(rsb_mg_grammian, "PARRSB_RSB_MG_GRAMMIAN", "%d"); - PRINT_OPTION(rsb_mg_factor, "PARRSB_RSB_MG_FACTOR", "%d"); - 
PRINT_OPTION(rsb_mg_sagg, "PARRSB_RSB_MG_SMOOTH_AGGREGATION", "%d"); -} +extern int fiedler(struct array *elements, int nv, + const parrsb_options *const options, struct comm *gsc, + buffer *buf, int verbose); -#undef PRINT_OPTION - -static size_t load_balance(struct array *elist, uint nel, int nv, double *coord, - long long *vtx, struct crystal *cr, buffer *bfr) { - struct comm *c = &cr->comm; - slong out[2][1], wrk[2][1], in = nel; - comm_scan(out, c, gs_long, gs_add, &in, 1, wrk); - slong start = out[0][0], nelg = out[1][0]; - - uint nstar = nelg / c->np, nrem = nelg - nstar * c->np; - slong lower = (nstar + 1) * nrem; - - size_t unit_size; - if (vtx == NULL) // RCB - unit_size = sizeof(struct rcb_element); - else // RSB - unit_size = sizeof(struct rsb_element); - - array_init_(elist, nel, unit_size, __FILE__, __LINE__); - - struct rcb_element *pe = (struct rcb_element *)calloc(1, unit_size); - pe->origin = c->id; - - int ndim = (nv == 8) ? 3 : 2; - for (uint e = 0; e < nel; ++e) { - slong eg = pe->globalId = start + e + 1; - if (nstar == 0) - pe->proc = eg - 1; - else if (eg <= lower) - pe->proc = (eg - 1) / (nstar + 1); - else - pe->proc = (eg - 1 - lower) / nstar + nrem; - - pe->coord[0] = pe->coord[1] = pe->coord[2] = 0.0; - for (int v = 0; v < nv; v++) - for (int n = 0; n < ndim; n++) - pe->coord[n] += coord[e * ndim * nv + v * ndim + n]; - for (int n = 0; n < ndim; n++) - pe->coord[n] /= nv; - - array_cat_(unit_size, elist, pe, 1, __FILE__, __LINE__); - } +static void test_component_versions(struct array *elements, struct comm *lc, + unsigned nv, unsigned lvl, buffer *bfr) { + // Send elements to % P processor to create disconnected components. 
+ struct crystal cr; + crystal_init(&cr, lc); + + struct rsb_element *pe = (struct rsb_element *)elements->ptr; + for (unsigned e = 0; e < elements->n; e++) + pe[e].proc = pe[e].globalId % lc->np; + + sarray_transfer(struct rsb_element, elements, proc, 1, &cr); - if (vtx != NULL) { // RSB - struct rsb_element *pr = (struct rsb_element *)elist->ptr; - for (uint e = 0; e < nel; e++) { - for (int v = 0; v < nv; v++) - pr[e].vertices[v] = vtx[e * nv + v]; + struct comm tc0; + int color = (lc->id < lc->np / 2); + comm_split(lc, color, lc->id, &tc0); + + sint nc1 = get_components(NULL, elements, nv, &tc0, bfr, 0); + sint nc2 = get_components_v2(NULL, elements, nv, &tc0, bfr, 0); + if (nc1 != nc2) { + if (tc0.id == 0) { + fprintf(stderr, "Error: Level = %u SS BFS != MS BFS: %d %d\n", lvl, nc1, + nc2); + fflush(stderr); } + exit(EXIT_FAILURE); + } + if (nc1 > 1) { + if (tc0.id == 0) + printf("Warning: Level = %u has %d disconnected components.\n", lvl, nc1); + fflush(stdout); } - sarray_transfer_(elist, unit_size, offsetof(struct rcb_element, proc), 1, cr); - if (vtx == NULL) // RCB - sarray_sort(struct rcb_element, elist->ptr, elist->n, globalId, 1, bfr); - else // RSB - sarray_sort(struct rsb_element, elist->ptr, elist->n, globalId, 1, bfr); - - free(pe); - return unit_size; + comm_free(&tc0); + sarray_transfer(struct rsb_element, elements, proc, 0, &cr); + crystal_free(&cr); } -static void restore_original(int *part, int *seq, struct crystal *cr, - struct array *elist, size_t usize, buffer *bfr) { - sarray_transfer_(elist, usize, offsetof(struct rcb_element, origin), 1, cr); - uint nel = elist->n; - - if (usize == sizeof(struct rsb_element)) // RSB - sarray_sort(struct rsb_element, elist->ptr, nel, globalId, 1, bfr); - else if (usize == sizeof(struct rcb_element)) // RCB - sarray_sort(struct rcb_element, elist->ptr, nel, globalId, 1, bfr); - - struct rcb_element *element; - uint e; - for (e = 0; e < nel; e++) { - element = (struct rcb_element *)((char *)elist->ptr + e * 
usize); - part[e] = element->origin; // element[e].origin; - } +static void check_rsb_partition(const struct comm *gc, + const parrsb_options *const opts) { + int max_levels = log2ll(gc->np); + int miter = opts->rsb_max_iter, mpass = opts->rsb_max_passes; + + for (int i = 0; i < max_levels; i++) { + sint converged = 1; + int val = (int)metric_get_value(i, RSB_FIEDLER_CALC_NITER); + if (opts->rsb_algo == 0) { + if (val == miter * mpass) + converged = 0; + } else if (opts->rsb_algo == 1) { + if (val == mpass) + converged = 0; + } - if (seq != NULL) { - for (e = 0; e < nel; e++) { - element = (struct rcb_element *)((char *)elist->ptr + e * usize); - seq[e] = element->seq; // element[e].seq; + struct comm c; + comm_split(gc, converged, gc->id, &c); + + slong bfr[4]; + if (converged == 0) { + if (opts->rsb_algo == 0) { + double init = metric_get_value(i, TOL_INIT); + comm_allreduce(&c, gs_double, gs_min, &init, 1, (void *)bfr); + + double target = metric_get_value(i, TOL_TGT); + comm_allreduce(&c, gs_double, gs_min, &target, 1, (void *)bfr); + + double final = metric_get_value(i, TOL_FNL); + comm_allreduce(&c, gs_double, gs_min, &final, 1, (void *)bfr); + if (c.id == 0) { + printf("Warning: Lanczos reached a residual of %lf (target: %lf) " + "after %d x %d iterations in Level=%d!\n", + final, target, mpass, miter, i); + fflush(stdout); + } + } else if (opts->rsb_algo == 1) { + if (c.id == 0) { + printf("Warning: Inverse iteration didn't converge after %d " + "iterations in Level = %d\n", + mpass, i); + fflush(stdout); + } + } + } + comm_free(&c); + + sint minc, maxc; + minc = maxc = (sint)metric_get_value(i, RSB_COMPONENTS_NCOMP); + comm_allreduce(gc, gs_int, gs_min, &minc, 1, (void *)bfr); + comm_allreduce(gc, gs_int, gs_max, &maxc, 1, (void *)bfr); + + if (maxc > 1 && gc->id == 0) { + printf("Warning: Partition created %d/%d (min/max) disconnected " + "components in Level=%d!\n", + minc, maxc, i); + fflush(stdout); } } } -int parrsb_part_mesh(int *part, int *seq, long 
long *vtx, double *coord, - int nel, int nv, parrsb_options options, MPI_Comm comm) { - struct comm c; - comm_init(&c, comm); +static int check_bin_val(int bin, struct comm *c) { + if (bin < 0 || bin > 1) { + if (c->id == 0) { + printf("%s:%d bin value out of range: %d\n", __FILE__, __LINE__, bin); + fflush(stdout); + } + return 1; + } + return 0; +} - slong nelg = nel, wrk; - comm_allreduce(&c, gs_long, gs_add, &nelg, 1, &wrk); +static int balance_partitions(struct array *elements, unsigned nv, + struct comm *lc, struct comm *gc, int bin, + buffer *bfr) { + // Return if there is only one processor. + if (gc->np == 1) + return 0; + + assert(check_bin_val(bin, gc) == 0); + + struct ielem_t { + uint index, orig; + sint dest; + scalar fiedler; + }; + + // Calculate expected # of elements per processor. + uint ne = elements->n; + slong nelgt = ne, nglob = ne, wrk; + comm_allreduce(lc, gs_long, gs_add, &nelgt, 1, &wrk); + comm_allreduce(gc, gs_long, gs_add, &nglob, 1, &wrk); + + sint ne_ = nglob / gc->np, nrem = nglob - ne_ * gc->np; + slong nelgt_exp = ne_ * lc->np + nrem / 2 + (nrem % 2) * (1 - bin); + slong send_cnt = nelgt - nelgt_exp > 0 ? nelgt - nelgt_exp : 0; + + // Setup gather-scatter. 
+ size_t size = ne * nv; + uint e, v; + slong *ids = tcalloc(slong, size); + struct rsb_element *elems = (struct rsb_element *)elements->ptr; + for (e = 0; e < ne; e++) { + for (v = 0; v < nv; v++) + ids[e * nv + v] = elems[e].vertices[v]; + } + struct gs_data *gsh = gs_setup(ids, size, gc, 0, gs_pairwise, 0); + + sint *input = (sint *)ids; + if (send_cnt > 0) { + for (e = 0; e < size; e++) + input[e] = 0; + } else { + for (e = 0; e < size; e++) + input[e] = 1; + } - update_options(&options); + gs(input, gs_int, gs_add, 0, gsh, bfr); - debug_print(&c, options.verbose_level, - "Running parRSB ..., nv = %d, nelg = %lld\n", nv, nelg); - if (c.id == 0 && options.verbose_level > 0) - print_options(&options); - fflush(stdout); + for (e = 0; e < ne; e++) + elems[e].proc = gc->id; - parrsb_barrier(&c); - double t = comm_time(); + sint sid = (send_cnt == 0) ? gc->id : INT_MAX, balanced = 0; + comm_allreduce(gc, gs_int, gs_min, &sid, 1, &wrk); struct crystal cr; - crystal_init(&cr, &c); - - buffer bfr; - buffer_init(&bfr, (nel + 1) * sizeof(struct rsb_element)); - - // Load balance input data - debug_print(&c, options.verbose_level, "Load balance ..."); - struct array elist; - size_t esize = load_balance(&elist, nel, nv, coord, vtx, &cr, &bfr); - debug_print(&c, options.verbose_level, " done.\n"); - - // Run RSB now - debug_print(&c, options.verbose_level, "Running the partitioner ..."); - struct comm ca; - comm_split(&c, elist.n > 0, c.id, &ca); - metric_init(); - if (elist.n > 0) { - slong out[2][1], wrk[2][1], in = elist.n; - comm_scan(out, &ca, gs_long, gs_add, &in, 1, wrk); - slong nelg = out[1][0]; - - int ndim = (nv == 8) ? 3 : 2; - switch (options.partitioner) { + + if (send_cnt > 0) { + struct array ielems; + array_init(struct ielem_t, &ielems, 10); + + struct ielem_t ielem = { + .index = 0, .orig = lc->id, .dest = -1, .fiedler = 0}; + int mul = (sid == 0) ? 
1 : -1; + for (e = 0; e < ne; e++) { + for (v = 0; v < nv; v++) { + if (input[e * nv + v] > 0) { + ielem.index = e, ielem.fiedler = mul * elems[e].fiedler; + array_cat(struct ielem_t, &ielems, &ielem, 1); + break; + } + } + } + + // Sort based on fiedler value and sets `orig` field + parallel_sort(struct ielem_t, &ielems, fiedler, gs_double, 0, 1, lc, bfr); + + slong out[2][1], bfr[2][1], nielems = ielems.n; + comm_scan(out, lc, gs_long, gs_add, &nielems, 1, bfr); + slong start = out[0][0]; + + sint P = gc->np - lc->np; + sint part_size = (send_cnt + P - 1) / P; + + if (out[1][0] >= send_cnt) { + balanced = 1; + struct ielem_t *ptr = ielems.ptr; + for (e = 0; start + e < send_cnt && e < ielems.n; e++) + ptr[e].dest = sid + (start + e) / part_size; + + crystal_init(&cr, lc); + sarray_transfer(struct ielem_t, &ielems, orig, 0, &cr); + crystal_free(&cr); + + ptr = ielems.ptr; + for (e = 0; e < ielems.n; e++) + if (ptr[e].dest != -1) + elems[ptr[e].index].proc = ptr[e].dest; + } + + array_free(&ielems); + } + + comm_allreduce(gc, gs_int, gs_max, &balanced, 1, &wrk); + if (balanced == 1) { + crystal_init(&cr, gc); + sarray_transfer(struct rsb_element, elements, proc, 0, &cr); + crystal_free(&cr); + + // Do a load balanced sort in each partition + parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, lc, + bfr); + } else { + // Forget about disconnected components, just do a load balanced partition + parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, gc, + bfr); + } + + free(ids), gs_free(gsh); + return 0; +} + +static int repair_partitions_v2(struct array *elems, unsigned nv, + struct comm *tc, struct comm *lc, unsigned bin, + unsigned algo, buffer *bfr) { + assert(check_bin_val(bin, lc) == 0); + + sint nc = get_components_v2(NULL, elems, nv, tc, bfr, 0), wrk; + comm_allreduce(lc, gs_int, gs_max, &nc, 1, &wrk); + if (nc > 1) { + // If nc > 1, send elements back and do RCBx, RCBy and RCBz + struct crystal cr; + crystal_init(&cr, lc); + 
sarray_transfer(struct rsb_element, elems, proc, 0, &cr); + crystal_free(&cr); + + // Do rcb or rib + unsigned ndim = (nv == 8) ? 3 : 2; + switch (algo) { case 0: - rsb(&elist, nv, 1, &options, &ca, &bfr); + parallel_sort(struct rsb_element, elems, globalId, gs_long, 0, 1, lc, + bfr); break; case 1: - rcb(&elist, esize, ndim, &ca, &bfr); + rcb(elems, sizeof(struct rsb_element), ndim, lc, bfr); break; case 2: - rib(&elist, esize, ndim, &ca, &bfr); + rib(elems, sizeof(struct rsb_element), ndim, lc, bfr); break; default: break; } - metric_rsb_print(&ca, options.profile_level); + // And count number of components again. If nc > 1 still, set + // isconnected = 1 + nc = get_components_v2(NULL, elems, nv, tc, bfr, 0); + comm_allreduce(lc, gs_int, gs_max, &nc, 1, &wrk); } - metric_finalize(), comm_free(&ca); - debug_print(&c, options.verbose_level, " done.\n"); - debug_print(&c, options.verbose_level, "Restore the original input ..."); - restore_original(part, seq, &cr, &elist, esize, &bfr); - debug_print(&c, options.verbose_level, " done.\n"); + return 0; +} + +static sint get_bisect_comm(struct comm *const tc, const struct comm *const lc, + const uint level, const uint levels, + const struct comm comms[3]) { + sint pid, psize; + if (level < levels - 1) { + sint out[2][1], wrk[2][1], in = (comms[level + 1].id == 0); + comm_scan(out, &comms[level], gs_int, gs_add, &in, 1, wrk); + psize = out[1][0], pid = (comms[level + 1].id == 0) * out[0][0]; + comm_allreduce(&comms[level + 1], gs_int, gs_max, &pid, 1, wrk); + } else { + psize = lc->np, pid = lc->id; + } - // Report time and finish - parrsb_barrier(&c); - debug_print(&c, 1, "par%s finished in %g seconds.\n", - ALGO[options.partitioner], comm_time() - t); + const sint bin = (pid >= (psize + 1) / 2); + comm_split(lc, bin, lc->id, tc); + return bin; +} - array_free(&elist), buffer_free(&bfr), crystal_free(&cr), comm_free(&c); +static uint get_level_cuts(const uint level, const uint levels, + const struct comm comms[3]) { + 
uint n; + if (level < levels - 1) { + sint size = (comms[level + 1].id == 0), wrk; + comm_allreduce(&comms[level], gs_int, gs_add, &size, 1, &wrk); + n = size; + } else { + n = comms[level].np; + } - return 0; + sint cuts = 0; + uint pow2 = 1; + while (pow2 < n) + pow2 <<= 1, cuts++; + + sint wrk; + comm_allreduce(&comms[0], gs_int, gs_max, &cuts, 1, &wrk); + + return cuts; } -void fparrsb_part_mesh(int *part, int *seq, long long *vtx, double *coord, - int *nel, int *nv, int *options, int *comm, int *err) { - *err = 1; - comm_ext c = MPI_Comm_f2c(*comm); - parrsb_options opt = parrsb_default_options; - *err = parrsb_part_mesh(part, seq, vtx, coord, *nel, *nv, opt, c); +void rsb(struct array *elements, int nv, const parrsb_options *const options, + const struct comm comms[3], buffer *bfr) { + const unsigned levels = options->levels; + const sint verbose = options->verbose_level; + const uint ndim = (nv == 8) ? 3 : 2; + const struct comm *gc = &comms[0]; + for (uint level = 0; level < levels; level++) { + // Find the maximum number of RSB cuts in current level. + uint ncuts = get_level_cuts(level, levels, comms); + parrsb_print(gc, verbose, "rsb: Level=%u/%u number of cuts = %u", level + 1, + levels, ncuts); + + struct comm lc; + comm_dup(&lc, &comms[level]); + for (uint cut = 0; cut < ncuts; cut++) { + // Run the pre-partitioner. + parrsb_print(gc, verbose - 1, "\trsb: Pre-partition ..."); + + metric_tic(&lc, RSB_PRE); + switch (options->rsb_pre) { + case 0: + parallel_sort(struct rsb_element, elements, globalId, gs_long, 0, 1, + &lc, bfr); + break; + case 1: + rcb(elements, sizeof(struct rsb_element), ndim, &lc, bfr); + break; + case 2: + rib(elements, sizeof(struct rsb_element), ndim, &lc, bfr); + break; + default: + break; + } + metric_toc(&lc, RSB_PRE); + + struct rsb_element *const pe = (struct rsb_element *const)elements->ptr; + for (unsigned i = 0; i < elements->n; i++) + pe[i].proc = lc.id; + + // Find the Fiedler vector. 
+ parrsb_print(gc, verbose - 1, "\trsb: Fiedler ... "); + metric_tic(&lc, RSB_FIEDLER); + fiedler(elements, nv, options, &lc, bfr, verbose - 2); + metric_toc(&lc, RSB_FIEDLER); + + // Sort by Fiedler vector. + parrsb_print(gc, verbose - 1, "\trsb: Sort ..."); + metric_tic(&lc, RSB_SORT); + parallel_sort(struct rsb_element, elements, fiedler, gs_double, 0, 1, &lc, + bfr); + metric_toc(&lc, RSB_SORT); + + // `tc` is the new communicator in newly found partitions. + struct comm tc; + sint bin = get_bisect_comm(&tc, &lc, level, levels, comms); + + // Find the number of disconnected components. + parrsb_print(gc, verbose - 1, "\trsb: Components ..."); + metric_tic(&lc, RSB_COMPONENTS); + const uint ncomp = + get_components_v2(NULL, elements, nv, &tc, bfr, verbose - 2); + metric_acc(RSB_COMPONENTS_NCOMP, ncomp); + metric_toc(&lc, RSB_COMPONENTS); + + // Bisect and balance. + parrsb_print(gc, verbose - 1, "\trsb: Balance ..."); + metric_tic(&lc, RSB_BALANCE); + balance_partitions(elements, nv, &tc, &lc, bin, bfr); + metric_toc(&lc, RSB_BALANCE); + + // Split the communicator and recurse on the sub-problems. + parrsb_print(gc, verbose - 1, "\trsb: Bisect ..."); + comm_free(&lc), comm_dup(&lc, &tc), comm_free(&tc); + + const uint nbrs = parrsb_get_neighbors(elements, nv, gc, &lc, bfr); + metric_acc(RSB_NEIGHBORS, nbrs); + metric_push_level(); + } + comm_free(&lc); + } + + check_rsb_partition(gc, options); } diff --git a/src/schur.c b/src/schur.c deleted file mode 100644 index a94bcfac..00000000 --- a/src/schur.c +++ /dev/null @@ -1,1264 +0,0 @@ -#include "coarse-impl.h" -#include "metrics.h" -#include "multigrid.h" -#include - -#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) - -//------------------------------------------------------------------------------ -// Cholesky factorization of a matrix -// -/* -symbolic factorization: finds the sparsity structure of L - -uses the concept of elimination tree: - the parent of node j is node i when L(i,j) is the first - non-zero in column j below the diagonal (i>j) - L's structure is discovered row-by-row; the first time - an entry in column j is set, it must be the parent - -the nonzeros in L are the nonzeros in A + paths up the elimination tree - -linear in the number of nonzeros of L -*/ -static uint *cholesky_symbolic(struct mat *L, uint n, uint const *Ap, - uint const *Ai, buffer *buf) { - L->n = n; - - uint *parent = tcalloc(uint, 2 * n), *visit = parent + n; - uint i, j, nz = 0; - for (i = 0; i < n; i++) { - parent[i] = n, visit[i] = i; - for (uint p = Ap[i]; p < Ap[i + 1]; p++) { - if ((j = Ai[p]) >= i) - break; - for (; visit[j] != i; j = parent[j]) { - ++nz, visit[j] = i; - if (parent[j] == n) { - parent[j] = i; - break; - } - } - } - } - - uint *Lp = L->Lp = tcalloc(uint, n + 1); - uint *Li = L->Li = tcalloc(uint, nz); - - Lp[0] = 0; - uint *Lir, nzr; - for (i = 0; i < n; i++) { - nzr = 0, Lir = &Li[Lp[i]]; - visit[i] = i; - for (uint p = Ap[i]; p < Ap[i + 1]; p++) { - if ((j = Ai[p]) >= i) - break; - for (; visit[j] != i; j = parent[j]) - Lir[nzr++] = j, visit[j] = i; - } - sortv(Lir, Lir, nzr, sizeof(uint), buf); - Lp[i + 1] = Lp[i] + nzr; - } - - free(parent); - return 0; -} - -/* -numeric factorization: - -L is built row-by-row, using: ( ' indicates transpose ) - - -[ A r ] = [ (I-L) ] [ D^(-1) ] [ (I-L)' -s ] -[ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ] - - = [ A (I-L) D^(-1) (-s) ] - [ r' s' D^(-1) s + 1/d ] - -so, if r' is the next row of A, up to but excluding the diagonal, -then the next row of L, s', obeys - - r = - (I-L) D^(-1) s - -let y = (I-L)^(-1) (-r) -then s = D y, and d = 1/(a - s' y) -*/ -static void cholesky_numeric(struct mat *chol, const uint n, const uint *Ap, - 
const uint *Ai, const scalar *A, uint *visit, - scalar *y) { - const uint *Lp = chol->Lp, *Li = chol->Li; - scalar *D = chol->D = tcalloc(scalar, n); - scalar *L = chol->L = tcalloc(scalar, Lp[n]); - - uint i; - for (i = 0; i < n; i++) { - uint p, pe, j; - scalar a; - visit[i] = n; - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) - j = Li[p], y[j] = 0, visit[j] = i; - for (p = Ap[i], pe = Ap[i + 1]; p != pe; p++) { - if ((j = Ai[p]) >= i) { - if (j == i) - a = A[p]; - break; - } - y[j] = -A[p]; - } - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) { - uint j = Li[p], q = Lp[j], qe = Lp[j + 1]; - scalar yj = y[j]; - for (; q != qe; q++) { - uint k = Li[q]; - if (visit[k] == i) - yj += L[q] * y[k]; - } - y[j] = yj; - scalar lij = L[p] = D[j] * yj; - a -= lij * yj; - } - D[i] = 1 / a; - } -} - -static void cholesky_factor(struct mat *L, struct mat *A, uint null_space, - buffer *buf) { - L->start = A->start; - const uint uints_as_dbls = - (A->n * sizeof(uint) + sizeof(double) - 1) / sizeof(double); - buffer_reserve(buf, (uints_as_dbls + A->n - null_space) * sizeof(double)); - cholesky_symbolic(L, A->n - null_space, A->Lp, A->Li, buf); - cholesky_numeric(L, L->n, A->Lp, A->Li, A->L, buf->ptr, - uints_as_dbls + (double *)buf->ptr); - A->n = A->n - null_space; -} - -static void cholesky_solve(scalar *x, const struct mat *A, scalar *b) { - const uint *Lp = A->Lp, *Li = A->Li, n = A->n; - const scalar *L = A->L, *D = A->D; - - uint i, p, pe; - for (i = 0; i < n; i++) { - scalar xi = b[i]; - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) - xi += x[Li[p]] * L[p]; - x[i] = xi; - } - - for (i = 0; i < n; i++) - x[i] *= D[i]; - - for (i = n; i > 0;) { - scalar xi = x[--i]; - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) { - x[Li[p]] += xi * L[p]; - } - x[i] = xi; - } -} - -static void cholesky_lower_solve(scalar *x, const struct mat *A, scalar *b) { - const uint *Lp = A->Lp, *Li = A->Li, n = A->n; - const scalar *L = A->L, *D = A->D; - - uint i, p, pe; - for (i = 0; i < n; i++) { - 
scalar xi = b[i]; - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) - xi += x[Li[p]] * L[p]; - x[i] = xi; - } - - for (i = 0; i < n; i++) - x[i] *= sqrt(D[i]); -} - -static void cholesky_upper_solve(scalar *x, const struct mat *A, scalar *b) { - const uint *Lp = A->Lp, *Li = A->Li, n = A->n; - const scalar *L = A->L, *D = A->D; - - uint i; - for (i = 0; i < n; i++) - x[i] = b[i] * sqrt(D[i]); - - uint p, pe; - for (i = n; i > 0;) { - scalar xi = x[--i]; - for (p = Lp[i], pe = Lp[i + 1]; p != pe; p++) { - x[Li[p]] += xi * L[p]; - } - x[i] = xi; - } -} - -//----------------------------------------------------------------------------- -// Schur setup, solve and free -// -// A_ll: local dof of a processor (block diagonal across processors) -// A_sl: shared - local matrix -// A_ss: shared dof freedom (matrix is split row wise) -// -// |A_ll (B) A_ls (F)| -// A= | | -// |A_sl (E) A_ss (S)| -// -struct schur { - struct mat A_ll; - struct par_mat A_ls, A_sl, A_ss; - struct gs_data *Q_ls, *Q_sl, *Q_ss; - struct mg *M; -}; - -static int S_owns_row(const ulong r, const ulong *rows, const uint n) { - // We can do a binary search instead of linear search - uint i = 0; - while (i < n && rows[i] != r) - i++; - return i; -} - -// Calculate G = L_{B}^{-1} x F where B = L_{B} U_{B}. F is in CSR format, -// distributed by rows similar to B. G will be in CSC format and distributed -// by columns similar to row distribution of S. -static int schur_setup_G(struct par_mat *G, scalar tol, const struct mat *L, - const struct par_mat *F, const ulong *srows, - const uint srn, struct crystal *const cr, - buffer *bfr) { - assert(IS_CSR(F)); - assert(!IS_DIAG(F)); - - buffer_reserve(bfr, sizeof(scalar) * L->n * F->cn); - scalar *v = (scalar *)bfr->ptr; - for (uint i = 0; i < L->n * F->cn; i++) - v[i] = 0; - - // Do L_B^{-1} x F now. Columns of L_B^{-1} are found one by one and - // then they are multplied by F. Is the above description correct? 
- scalar *b = tcalloc(scalar, 2 * L->n); - scalar *x = b + L->n; - for (uint i = 0; i < F->rn; i++) { - b[F->rows[i] - L->start] = 1; - cholesky_lower_solve(x, L, b); - - // Calculate F: i^th row of F is multiplied by each element of i^th - // column of L_B^-1 - for (uint k = F->adj_off[i], ke = F->adj_off[i + 1]; k < ke; k++) - for (uint j = 0; j < L->n; j++) - // m.c = F->cols[F->adj_idx[k]], m.r = L->start + j; - v[j * F->cn + F->adj_idx[k]] += F->adj_val[k] * x[j]; - - b[F->rows[i] - L->start] = 0; - for (uint j = 0; j < L->n; j++) - x[j] = 0; - } - - uint size = L->n * 20 + 1; - struct array unique; - array_init(struct mij, &unique, size); - - struct comm *c = &cr->comm; - struct mij m = {.r = 0, .c = 0, .idx = 1, .p = 0, .v = 0}; - for (uint i = 0; i < L->n; i++) { - for (uint j = 0; j < F->cn; j++) { - if (fabs(v[i * F->cn + j]) >= tol) { - m.r = L->start + i, m.c = F->cols[j], m.p = m.c % c->np; - m.v = v[i * F->cn + j]; - array_cat(struct mij, &unique, &m, 1); - } - } - } - - m.r = 0, m.idx = 0, m.v = 0; - for (uint i = 0; i < srn; i++) { - m.c = srows[i], m.p = m.c % c->np; - array_cat(struct mij, &unique, &m, 1); - } - - sarray_transfer(struct mij, &unique, p, 1, cr); - - struct array mijs; - array_init(struct mij, &mijs, unique.n); - if (unique.n > 0) { - sarray_sort_2(struct mij, unique.ptr, unique.n, c, 1, idx, 0, bfr); - struct mij *pu = (struct mij *)unique.ptr; - uint i = 0, j = 1; - for (; j < unique.n; j++) { - if (pu[j].c != pu[i].c) { - assert(pu[i].idx == 0); - for (uint k = i + 1; k < j; k++) { - pu[k].p = pu[i].p; - array_cat(struct mij, &mijs, &pu[k], 1); - } - i = j; - } - } - assert(pu[i].idx == 0); - for (uint k = i + 1; k < unique.n; k++) { - pu[k].p = pu[i].p; - array_cat(struct mij, &mijs, &pu[k], 1); - } - } - array_free(&unique); - - sarray_transfer(struct mij, &mijs, p, 0, cr); - par_csc_setup(G, &mijs, 0, bfr); - array_free(&mijs); -#ifdef DUMPG - par_mat_print(G); -#endif - - return 0; -} - -// Calculate W = E x U_{B}^{-1} where 
B = L_{B} U_{B}. E is in CSC format. -// W will be in CSR format and distributed by rows similar to distribution of S. -static int schur_setup_W(struct par_mat *W, scalar tol, const struct mat *L, - const struct par_mat *E, const ulong *srows, - const uint srn, struct crystal *const cr, - buffer *bfr) { - assert(IS_CSC(E)); - assert(!IS_DIAG(E)); - - buffer_reserve(bfr, sizeof(scalar) * L->n * E->rn); - scalar *v = (scalar *)bfr->ptr; - for (uint i = 0; i < L->n * E->rn; i++) - v[i] = 0; - - // Multiply E by U_B^{-1} now. Columns of U_B^{-1} are found one by one and - // then E is multiplied by each column. - scalar *b = tcalloc(scalar, 2 * L->n); - scalar *x = b + L->n; - for (uint i = 0; i < L->n; i++) { - b[i] = 1; - cholesky_upper_solve(x, L, b); - - // Multiply E by x: i^th col of E is multiplied by element x[i] - for (uint j = 0; j < E->cn; j++) - for (uint k = E->adj_off[j], ke = E->adj_off[j + 1]; k < ke; k++) - // m.c = L->start + i, m.r = E->rows[E->adj_idx[k]]; - v[E->adj_idx[k] * L->n + i] += E->adj_val[k] * x[E->cols[j] - L->start]; - - b[i] = 0; - for (uint j = 0; j < L->n; j++) - x[j] = 0; - } - - uint size = E->rn * 20 + 1; - struct array unique; - array_init(struct mij, &unique, size); - - struct comm *c = &cr->comm; - struct mij m = {.r = 0, .c = 0, .idx = 1, .p = 0, .v = 0}; - for (uint i = 0; i < E->rn; i++) { - for (uint j = 0; j < L->n; j++) { - if (fabs(v[i * L->n + j]) >= tol) { - m.r = E->rows[i], m.c = L->start + j, m.p = m.r % c->np; - m.v = v[i * L->n + j]; - array_cat(struct mij, &unique, &m, 1); - } - } - } - - m.c = 0, m.idx = 0, m.v = 0; - for (uint i = 0; i < srn; i++) { - m.r = srows[i], m.p = m.r % c->np; - array_cat(struct mij, &unique, &m, 1); - } - - sarray_transfer(struct mij, &unique, p, 1, cr); - - struct array mijs; - array_init(struct mij, &mijs, unique.n); - if (unique.n > 0) { - sarray_sort_2(struct mij, unique.ptr, unique.n, r, 1, idx, 0, bfr); - struct mij *pu = (struct mij *)unique.ptr; - uint i = 0, j = 1; - for (; j 
< unique.n; j++) { - if (pu[j].r != pu[i].r) { - assert(pu[i].idx == 0); - for (uint k = i + 1; k < j; k++) { - pu[k].p = pu[i].p; - array_cat(struct mij, &mijs, &pu[k], 1); - } - i = j; - } - } - assert(pu[i].idx == 0); - for (uint k = i + 1; k < unique.n; k++) { - pu[k].p = pu[i].p; - array_cat(struct mij, &mijs, &pu[k], 1); - } - } - array_free(&unique); - - sarray_transfer(struct mij, &mijs, p, 0, cr); - par_csr_setup(W, &mijs, 0, bfr); - array_free(&mijs); -#ifdef DUMPW - par_mat_print(W); -#endif - - return 0; -} - -// C = A - B; A and B should be in CSR format with the same row -// distribution across processors -static int sparse_sub(struct par_mat *C, const struct par_mat *A, - const struct par_mat *B, buffer *bfr) { - assert(IS_CSR(A)); - assert(IS_CSR(B)); - - struct array cij; - array_init(struct mij, &cij, 100); - - struct mij m; - uint r, j, je; - for (r = 0; r < B->rn; r++) { - m.r = B->rows[r]; - for (j = B->adj_off[r], je = B->adj_off[r + 1]; j != je; j++) { - m.c = B->cols[B->adj_idx[j]], m.v = -B->adj_val[j]; - array_cat(struct mij, &cij, &m, 1); - } - } - if (IS_DIAG(B)) { - for (r = 0; r < B->rn; r++) { - m.r = m.c = B->rows[r], m.v = -B->diag_val[r]; - array_cat(struct mij, &cij, &m, 1); - } - } - - for (r = 0; r < A->rn; r++) { - m.r = A->rows[r]; - for (j = A->adj_off[r], je = A->adj_off[r + 1]; j != je; j++) { - m.c = A->cols[A->adj_idx[j]], m.v = A->adj_val[j]; - array_cat(struct mij, &cij, &m, 1); - } - } - if (IS_DIAG(A)) { - for (r = 0; r < A->rn; r++) { - m.r = A->rows[r], m.c = A->rows[r], m.v = A->diag_val[r]; - array_cat(struct mij, &cij, &m, 1); - } - } - - struct array unique; - array_init(struct mij, &unique, 100); - if (cij.n > 0) { - sarray_sort_2(struct mij, cij.ptr, cij.n, r, 1, c, 1, bfr); - struct mij *ptr = (struct mij *)cij.ptr; - uint i = 0; - while (i < cij.n) { - scalar s = 0; - for (j = i; j < cij.n && ptr[j].r == ptr[i].r && ptr[j].c == ptr[i].c; - j++) - s += ptr[j].v; - m = ptr[i], m.v = s; - array_cat(struct mij, 
&unique, &m, 1); - i = j; - } - } - array_free(&cij); - - par_csr_setup(C, &unique, 1, bfr); - array_free(&unique); - - return 0; -} - -int sparse_gemm(struct par_mat *WG, const struct par_mat *W, - const struct par_mat *G, int diag_wg, struct crystal *cr, - buffer *bfr) { - // W is in CSR, G is in CSC; we multiply rows of W by shifting - // the columns of G from processor to processor. This is not scalable - // at all -- need to do a 2D partition of the matrices W and G. - assert(IS_CSR(W) && !IS_DIAG(W)); - assert(IS_CSC(G)); - - // Put G into an array to transfer from processor to processor - struct array gij, sij; - array_init(struct mij, &gij, 100); - array_init(struct mij, &sij, 100); - - struct mij m = {.r = 0, .c = 0, .idx = 0, .p = cr->comm.id, .v = 0}; - uint i, j, je; - for (i = 0; i < G->cn; i++) { - m.c = G->cols[i]; - for (j = G->adj_off[i], je = G->adj_off[i + 1]; j != je; j++) { - m.r = G->rows[G->adj_idx[j]]; - m.v = G->adj_val[j]; - array_cat(struct mij, &gij, &m, 1); - } - } - if (IS_DIAG(G)) { - for (i = 0; i < G->cn; i++) { - m.c = m.r = G->cols[i]; - m.v = G->diag_val[i]; - array_cat(struct mij, &gij, &m, 1); - } - } - - sarray_sort_2(struct mij, gij.ptr, gij.n, c, 1, r, 1, bfr); - struct mij *pg = (struct mij *)gij.ptr; - for (i = 0; i < gij.n; i++) - pg[i].idx = i; - - for (uint p = 0; p < cr->comm.np; p++) { - // Calculate dot product of each row of W with columns of G - for (i = 0; i < W->rn; i++) { - m.r = W->rows[i]; - uint s = 0, e = 0; - while (s < gij.n) { - m.c = pg[s].c, m.v = 0; - for (j = W->adj_off[i], je = W->adj_off[i + 1]; j < je; j++) { - ulong k = W->cols[W->adj_idx[j]]; - while (e < gij.n && pg[s].c == pg[e].c && pg[e].r < k) - e++; - if (e < gij.n && pg[s].c == pg[e].c && pg[e].r == k) - m.v += W->adj_val[j] * pg[e].v; - } - while (e < gij.n && pg[s].c == pg[e].c) - e++; - if (fabs(m.v) > 1e-12) - array_cat(struct mij, &sij, &m, 1); - s = e; - } - } - - sint next = (cr->comm.id + 1) % cr->comm.np; - for (i = 0; i < gij.n; 
i++) - pg[i].p = next; - sarray_transfer(struct mij, &gij, p, 0, cr); - - sarray_sort(struct mij, gij.ptr, gij.n, idx, 0, bfr); - pg = gij.ptr; - } - - par_csr_setup(WG, &sij, diag_wg, bfr); - array_free(&gij), array_free(&sij); - - return 0; -} - -static struct mg * -schur_precond_setup(const struct mat *L, const struct par_mat *F, - const struct par_mat *S, const struct par_mat *E, ulong si, - uint ni, struct crystal *cr, buffer *bfr) { - // TODO: Sparsify W and G when they are built - struct par_mat W, G, WG; - - struct comm *c = &cr->comm; - comm_barrier(c); - double t = comm_time(); - - double tol = 1e-12; - char *val = getenv("PARRSB_SCHUR_TOL"); - if (val) - tol = atof(val); - schur_setup_G(&G, tol, L, F, S->rows, S->rn, cr, bfr); - - t = comm_time() - t; - double wrk, min = t, max = t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup G : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - schur_setup_W(&W, tol, L, E, S->rows, S->rn, cr, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup W : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - sparse_gemm(&WG, &W, &G, 0, cr, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSparse gemm : %g %g (min max)\n", min, max); - fflush(stdout); - } - -#ifdef DUMPWG - par_mat_print(&WG); -#endif - - comm_barrier(c); - t = comm_time(); - - // P is CSR - struct par_mat *P = tcalloc(struct par_mat, 1); - sparse_sub(P, S, &WG, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - 
if (c->id == 0) { - printf("\tSparse sub : %g %g (min max)\n", min, max); - fflush(stdout); - } - -#ifdef DUMPP - par_mat_print(P); -#endif - - par_mat_free(&W), par_mat_free(&G), par_mat_free(&WG); - - comm_barrier(c); - t = comm_time(); - - int factor = 2; - val = getenv("PARRSB_SCHUR_MG_FACTOR"); - if (val) - factor = atoi(val); - struct mg *precond = mg_setup(P, factor, 0, cr, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tMG precond : %g %g (min max)\n", min, max); - fflush(stdout); - } - - return precond; -} - -static struct gs_data *setup_Ezl_Q(struct par_mat *E, ulong s, uint n, - struct comm *c, buffer *bfr) { - assert(IS_CSC(E)); - assert(!IS_DIAG(E)); - - buffer_reserve(bfr, sizeof(slong) * (n + E->rn)); - slong *ids = (slong *)bfr->ptr; - uint i, j; - for (i = 0; i < n; i++) - ids[i] = s + i; - for (j = 0; j < E->rn; j++, i++) - ids[i] = -E->rows[j]; - -#if 0 - comm_barrier(c); - for (uint p = 0; p < c->np; p++) { - if (c->id == p) { - printf("\np = %d, s = %u ids = ", p, n + E->rn); - for (uint i = 0; i < n + E->rn; i++) { - printf("%lld ", ids[i]); - fflush(stdout); - } - printf("\n"); - } - comm_barrier(c); - } -#endif - - return gs_setup(ids, n + E->rn, c, 0, gs_auto, 0); -} - -static int Ezl(scalar *y, const struct par_mat *E, struct gs_data *gsh, - const scalar *zl, const ulong s, const uint n, buffer *bfr) { - assert(IS_CSC(E)); - assert(!IS_DIAG(E)); - - uint nn = n + E->rn; - scalar *wrk = (scalar *)tcalloc(scalar, nn); - scalar *ye = wrk + n; - for (uint i = 0; i < E->cn; i++) { - scalar zlk = zl[E->cols[i] - s]; - for (uint j = E->adj_off[i], je = E->adj_off[i + 1]; j < je; j++) - ye[E->adj_idx[j]] += zlk * E->adj_val[j]; - } - -#if 0 - for (uint i = 0; i < n + E->rn; i++) { - printf("wrk in = %u, E->rn = %u, E->cn = %u, i = %u, %lf\n", n, E->rn, - E->cn, i, wrk[i]); - fflush(stdout); - } -#endif - - gs(wrk, 
gs_double, gs_add, 1, gsh, bfr); - - for (uint i = 0; i < n; i++) - y[i] = wrk[i]; - - free(wrk); - - return 0; -} - -static struct gs_data *setup_Fxi_Q(struct par_mat *F, ulong s, uint n, - struct comm *c, buffer *bfr) { - assert(IS_CSR(F)); - assert(!IS_DIAG(F)); - - uint nnz = F->rn > 0 ? F->adj_off[F->rn] : 0; - buffer_reserve(bfr, sizeof(slong) * (n + nnz)); - slong *ids = (slong *)bfr->ptr; - uint i, j; - for (i = 0; i < nnz; i++) - ids[i] = F->cols[F->adj_idx[i]]; - for (j = 0; j < n; j++, i++) - ids[i] = -(s + j); - - return gs_setup(ids, i, c, 0, gs_pairwise, 0); -} - -static int Fxi(scalar *y, const struct par_mat *F, struct gs_data *gsh, - scalar *xi, const ulong s, const uint n, buffer *bfr) { - assert(IS_CSR(F)); - assert(!IS_DIAG(F)); - - uint nnz = F->rn > 0 ? F->adj_off[F->rn] : 0; - scalar *wrk = (scalar *)tcalloc(scalar, nnz + n); - uint i, j; - for (i = 0; i < nnz; i++) - wrk[i] = 0; - for (j = 0; j < n; j++, i++) - wrk[i] = xi[j]; - - gs(wrk, gs_double, gs_add, 1, gsh, bfr); - - for (i = 0; i < F->rn; i++) { - scalar si = 0; - for (uint j = F->adj_off[i], je = F->adj_off[i + 1]; j < je; j++) - si += F->adj_val[j] * wrk[j]; - y[F->rows[i] - s] = si; - } - return 0; -} - -static int distribute_by_columns(struct array *aij, ulong s, uint n, ulong ng, - struct crystal *cr, buffer *bfr) { - slong *cols = (slong *)tcalloc(slong, n + aij->n); - sint *owner = (sint *)tcalloc(sint, n + aij->n); - - struct mij *ptr = (struct mij *)aij->ptr; - for (uint i = 0; i < aij->n; i++) { - cols[i] = ptr[i].c; - owner[i] = -1; - } - - struct comm *c = &cr->comm; - for (uint i = 0; i < n; i++) { - cols[aij->n + i] = s + i; - owner[aij->n + i] = c->id; - } - - struct gs_data *gsh = gs_setup(cols, aij->n + n, c, 0, gs_auto, 0); - gs(owner, gs_int, gs_max, 0, gsh, bfr); - gs_free(gsh); - - for (uint i = 0; i < aij->n; i++) { - assert(owner[i] >= 0 && owner[i] < c->np); - ptr[i].p = owner[i]; - } - - free(owner); - free(cols); - - sarray_transfer(struct mij, aij, p, 1, 
cr); - - return 0; -} - -static inline scalar dot(scalar *r, scalar *s, uint n) { - scalar t = 0; - for (uint i = 0; i < n; i++) - t += r[i] * s[i]; - return t; -} - -static inline void ortho(scalar *q, uint n, ulong ng, struct comm *c) { - scalar s = 0, buf; - for (uint i = 0; i < n; i++) - s += q[i]; - - comm_allreduce(c, gs_double, gs_add, &s, 1, &buf); - s /= ng; - - for (uint i = 0; i < n; i++) - q[i] -= s; -} - -static int schur_action(scalar *y, const struct schur *schur, scalar *x, - ulong ls, scalar *wrk, buffer *bfr, struct comm *c) { - const struct par_mat *S = &schur->A_ss; - assert(IS_CSR(S)); - assert(S->rn == 0 || IS_DIAG(S)); - - uint ln = schur->A_ll.n, in = S->rn; - uint mn = ln > in ? ln : in; - scalar *xl = (scalar *)tcalloc(scalar, 2 * mn), *exl = xl + mn; - - metric_tic(c, SCHUR_PROJECT_OPERATOR_FXI); - // Calculate (E (B^-1) F) x - // Fx: x has size in, Fx has size ln. So wrk has to be at least ln - Fxi(exl, &schur->A_ls, schur->Q_ls, x, ls, in, bfr); - metric_toc(c, SCHUR_PROJECT_OPERATOR_FXI); - - metric_tic(c, SCHUR_PROJECT_OPERATOR_CHOL); - // Multiply Fx by B^-1 or (LU)^-1 - cholesky_solve(xl, &schur->A_ll, exl); - metric_toc(c, SCHUR_PROJECT_OPERATOR_CHOL); - - metric_tic(c, SCHUR_PROJECT_OPERATOR_EZL); - // Multuply (B^-1)Fx by E - Ezl(exl, &schur->A_sl, schur->Q_sl, xl, ls, in, bfr); - metric_toc(c, SCHUR_PROJECT_OPERATOR_EZL); - - metric_tic(c, SCHUR_PROJECT_OPERATOR_MATVEC); - // Separately calculate Sx - mat_vec_csr(y, x, S, schur->Q_ss, wrk, bfr); - metric_toc(c, SCHUR_PROJECT_OPERATOR_MATVEC); - - for (uint i = 0; i < in; i++) - y[i] -= exl[i]; - - free(xl); - - return 0; -} - -static int project(scalar *x, scalar *b, const struct schur *schur, ulong ls, - struct comm *c, int miter, scalar tol, int null_space, - int verbose, buffer *bfr) { - const struct par_mat *S = &schur->A_ss; - struct mg *d = schur->M; - - slong out[2][1], buf[2][1], in = S->rn; - comm_scan(out, c, gs_long, gs_add, &in, 1, buf); - ulong ng = out[1][0]; - - 
if (ng == 0) - return 0; - - uint n = S->rn, nnz = n > 0 ? S->adj_off[n] + n : 0; - scalar *z = (scalar *)tcalloc(scalar, 6 * n + nnz); - scalar *w = z + n, *r = w + n, *p = r + n, *z0 = p + n, *dz = z0 + n; - scalar *wrk = dz + n; - scalar *P = (scalar *)tcalloc(scalar, 2 * (miter + 1) * n); - scalar *W = P + n * (miter + 1); - - uint i; - for (i = 0; i < n; i++) { - x[i] = 0; - r[i] = b[i]; - } - - scalar rr = dot(r, r, n); - comm_allreduce(c, gs_double, gs_add, &rr, 1, buf); - scalar rtol = MAX(rr * tol * tol, tol * tol); - - for (i = 0; i < n; i++) - z[i] = r[i]; - if (null_space) - ortho(z, n, ng, c); - - scalar rz1 = dot(r, z, n); - comm_allreduce(c, gs_double, gs_add, &rz1, 1, buf); - - for (i = 0; i < n; i++) - p[i] = z[i]; - - scalar alpha, beta, rzt, rz2; - uint j, k; - for (i = 0; i < miter; i++) { - // Action of S - E (LU)^-1 F - metric_tic(c, SCHUR_PROJECT_OPERATOR); - schur_action(w, schur, p, ls, wrk, bfr, c); - metric_toc(c, SCHUR_PROJECT_OPERATOR); - - scalar pw = dot(p, w, n); - comm_allreduce(c, gs_double, gs_add, &pw, 1, buf); - alpha = rz1 / pw; - - pw = 1.0 / sqrt(pw); - for (j = 0; j < n; j++) { - W[i * n + j] = pw * w[j]; - P[i * n + j] = pw * p[j]; - } - - for (j = 0; j < n; j++) { - x[j] += alpha * p[j]; - r[j] -= alpha * w[j]; - } - - rr = dot(r, r, n); - comm_allreduce(c, gs_double, gs_add, &rr, 1, buf); - if (rr < rtol || sqrt(rr) < tol) - break; - - for (j = 0; j < n; j++) - z0[j] = z[j]; - - metric_tic(c, SCHUR_PROJECT_PRECOND); -#if 1 - mg_vcycle(z, r, d, c, bfr); -#else - for (j = 0; j < n; j++) - z[j] = r[j]; -#endif - metric_toc(c, SCHUR_PROJECT_PRECOND); - - if (null_space) - ortho(z, n, ng, c); - for (j = 0; j < n; j++) - dz[j] = z[j] - z0[j]; - - // Do the following two reductions together - rzt = rz1; - rz1 = dot(r, z, n); - comm_allreduce(c, gs_double, gs_add, &rz1, 1, buf); - rz2 = dot(r, dz, n); - comm_allreduce(c, gs_double, gs_add, &rz2, 1, buf); - - if (c->id == 0 && verbose > 0) { - printf("i = %u rr = %e rtol = %e rz0 
= %e rz1 = %e rz2 = %e\n", i, rr, - rtol, rzt, rz1, rz2); - fflush(stdout); - } - - beta = rz2 / rzt; - for (j = 0; j < n; j++) - p[j] = z[j] + beta * p[j]; - - for (k = 0; k < n; k++) - P[miter * n + k] = 0; - - for (j = 0; j <= i; j++) { - pw = 0; - for (k = 0; k < n; k++) - pw += W[j * n + k] * p[k]; - comm_allreduce(c, gs_double, gs_add, &pw, 1, buf); - for (k = 0; k < n; k++) - P[miter * n + k] += pw * P[j * n + k]; - } - - for (k = 0; k < n; k++) - p[k] -= P[miter * n + k]; - } - - free(z); - free(P); - - return i == miter ? i : i + 1; -} - -//============================================================================== -// Dump matrix for debug purposes -// -struct mij_t { - ulong r, c; - scalar v; - uint p; -}; - -static int append_par_mat(struct array *mijs, const struct par_mat *A) { - struct mij_t t = {.r = 0, .c = 0, .v = 0, .p = 0}; - if (IS_CSR(A)) { - for (uint i = 0; i < A->rn; i++) { - t.r = A->rows[i]; - for (uint j = A->adj_off[i]; j < A->adj_off[i + 1]; j++) { - t.c = A->cols[A->adj_idx[j]], t.v = A->adj_val[j]; - array_cat(struct mij_t, mijs, &t, 1); - } - if (IS_DIAG(A)) { - t.c = t.r, t.v = A->diag_val[i]; - array_cat(struct mij_t, mijs, &t, 1); - } - } - } else if (IS_CSC(A)) { - for (uint i = 0; i < A->cn; i++) { - t.c = A->cols[i]; - for (uint j = A->adj_off[i]; j < A->adj_off[i + 1]; j++) { - t.r = A->rows[A->adj_idx[j]], t.v = A->adj_val[j]; - array_cat(struct mij_t, mijs, &t, 1); - } - if (IS_DIAG(A)) { - t.r = t.c, t.v = A->diag_val[i]; - array_cat(struct mij_t, mijs, &t, 1); - } - } - } - return 0; -} - -int schur_dump(const char *name, const struct mat *B, - const struct par_mat *A_ls, const struct par_mat *A_sl, - const struct par_mat *A_ss, struct crystal *cr, buffer *bfr) { - struct comm *c = &cr->comm; - - struct array mijs; - array_init(struct mij_t, &mijs, 1000); - - struct mij_t m = {.r = 0, .c = 0, .v = 0, .p = 0}; - for (uint i = 0; i < B->n; i++) { - m.r = B->start + i; - for (uint j = B->Lp[i]; j < B->Lp[i + 1]; j++) { - 
m.c = B->start + B->Li[j], m.v = B->L[j]; - array_cat(struct mij_t, &mijs, &m, 1); - } - if (B->D != NULL) { - m.c = m.r, m.v = B->D[i]; - array_cat(struct mij_t, &mijs, &m, 1); - } - } - - append_par_mat(&mijs, A_ls); - append_par_mat(&mijs, A_sl); - append_par_mat(&mijs, A_ss); - - sarray_transfer(struct mij_t, &mijs, p, 0, cr); - sarray_sort_2(struct mij_t, mijs.ptr, mijs.n, r, 1, c, 1, bfr); - - if (c->id == 0 && mijs.n > 0) { - FILE *fp = fopen(name, "w"); - if (fp != NULL) { - struct mij_t *pm = (struct mij_t *)mijs.ptr; - for (uint i = 0; i < mijs.n; i++) - fprintf(fp, "%llu %llu %.15lf\n", pm[i].r, pm[i].c, pm[i].v); - fclose(fp); - } - } - - array_free(&mijs); - - return 0; -} - -//============================================================================== -// Schur setup -// -int schur_setup(struct coarse *crs, struct array *eij, struct crystal *cr, - buffer *bfr) { - struct comm *c = &cr->comm; - comm_barrier(c); - double t = comm_time(); - - // Setup A_ll - struct array ll, ls, sl, ss; - array_init(struct mij, &ll, eij->n / 4 + 1); - array_init(struct mij, &ls, eij->n / 4 + 1); - array_init(struct mij, &sl, eij->n / 4 + 1); - array_init(struct mij, &ss, eij->n / 4 + 1); - - struct mij *ptr = (struct mij *)eij->ptr; - for (uint i = 0; i < eij->n; i++) { - if (ptr[i].r <= crs->ng[0]) { - if (ptr[i].c <= crs->ng[0]) - array_cat(struct mij, &ll, &ptr[i], 1); - else - array_cat(struct mij, &ls, &ptr[i], 1); - } else if (ptr[i].c <= crs->ng[0]) { - array_cat(struct mij, &sl, &ptr[i], 1); - } else { - array_cat(struct mij, &ss, &ptr[i], 1); - } - } - - t = comm_time() - t; - double wrk, min = t, max = t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSeparate matrices: %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - struct schur *schur = crs->solver = (struct schur *)tcalloc(struct schur, 1); - - // Setup local 
block diagonal (B). This is distributed by rows based on the - // partitioning. - struct mat B; - csr_setup(&B, &ll, 0, bfr); - if (!crs->null_space || (crs->n[1] + crs->n[2] != 0)) - cholesky_factor(&schur->A_ll, &B, 0, bfr); - else - cholesky_factor(&schur->A_ll, &B, 1, bfr); - schur->A_ll.start = crs->s[0]; - array_free(&ll); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup B : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - // Setup S: Setup interface nodes. This is distributed by rows in a load - // balanced manner. - par_csr_setup(&schur->A_ss, &ss, 1, bfr); - array_free(&ss); - schur->Q_ss = setup_Q(&schur->A_ss, &cr->comm, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup S : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - // Setup F: Setup local interface connectivity. This is distributed by rows - // similar to B. - par_csr_setup(&schur->A_ls, &ls, 0, bfr); - array_free(&ls); - schur->Q_ls = setup_Fxi_Q(&schur->A_ls, crs->s[1], crs->n[1], &cr->comm, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup F : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - // Setup E: E is distributed by columns in the same manner as columns (or - // rows) of B. 
- distribute_by_columns(&sl, crs->s[0], crs->n[0], crs->ng[0], cr, bfr); - par_csc_setup(&schur->A_sl, &sl, 0, bfr); - array_free(&sl); - schur->Q_sl = setup_Ezl_Q(&schur->A_sl, crs->s[1], crs->n[1], &cr->comm, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup E : %g %g (min max)\n", min, max); - fflush(stdout); - } - - comm_barrier(c); - t = comm_time(); - - // Setup the preconditioner for the Schur complement matrix - schur->M = schur_precond_setup(&schur->A_ll, &schur->A_ls, &schur->A_ss, - &schur->A_sl, crs->s[1], crs->n[1], cr, bfr); - - min = max = comm_time() - t; - comm_allreduce(c, gs_double, gs_min, &min, 1, &wrk); - comm_allreduce(c, gs_double, gs_max, &max, 1, &wrk); - if (c->id == 0) { - printf("\tSetup MG Precond : %g %g (min max)\n", min, max); - fflush(stdout); - } - - return 0; -} - -int schur_solve(scalar *x, struct coarse *crs, scalar *b, scalar tol, - buffer *bfr) { - struct comm *c = &crs->c; - struct schur *schur = crs->solver; - - uint ln = crs->n[0], in = crs->n[1]; - scalar *rhs = (scalar *)tcalloc(scalar, ln > in ? ln : in); - scalar *zl = (scalar *)tcalloc(scalar, ln); - scalar *xl = (scalar *)tcalloc(scalar, in + ln), *xi = xl + ln; - - // Solve: A_ll z_l = r_l - for (uint i = 0; i < ln; i++) - rhs[i] = b[i]; - - metric_tic(c, SCHUR_SOLVE_CHOL1); - cholesky_solve(zl, &schur->A_ll, rhs); - if (crs->null_space && (crs->n[1] + crs->n[2]) == 0) - zl[ln - 1] = 0; - metric_toc(c, SCHUR_SOLVE_CHOL1); - - metric_tic(c, SCHUR_SOLVE_SETRHS1); - // Solve: A_ss x_i = fi where fi = r_i - E zl - Ezl(rhs, &schur->A_sl, schur->Q_sl, zl, crs->s[0], in, bfr); - for (uint i = 0; i < in; i++) - rhs[i] = b[ln + i] - rhs[i]; - metric_toc(c, SCHUR_SOLVE_SETRHS1); - - metric_tic(c, SCHUR_SOLVE_PROJECT); - unsigned miter = (tol < 0 ? fabs(tol) : 100); - scalar mtol = (tol > 0 ? 
tol : 1e-7); - int iter = project(xi, rhs, schur, crs->s[0], c, miter, mtol, 0, 1, bfr); - metric_toc(c, SCHUR_SOLVE_PROJECT); - metric_acc(SCHUR_PROJECT_NITER, iter); - - // Solve A_ll xl = fl where fl = r_l - F xi - metric_tic(c, SCHUR_SOLVE_SETRHS2); - for (uint i = 0; i < ln; i++) - rhs[i] = 0; - Fxi(rhs, &schur->A_ls, schur->Q_ls, xi, crs->s[0], in, bfr); - for (uint i = 0; i < ln; i++) - rhs[i] = b[i] - rhs[i]; - metric_toc(c, SCHUR_SOLVE_SETRHS2); - - metric_tic(c, SCHUR_SOLVE_CHOL2); - cholesky_solve(xl, &schur->A_ll, rhs); - if (crs->null_space && (crs->n[1] + crs->n[2]) == 0) - xl[ln - 1] = 0; - metric_toc(c, SCHUR_SOLVE_CHOL2); - - for (uint i = 0; i < ln + in; i++) - x[i] = xl[i]; - - if (crs->null_space) { - scalar sum = 0, wrk; - for (uint i = 0; i < ln + in; i++) - sum += x[i]; - comm_allreduce(c, gs_double, gs_add, &sum, 1, &wrk); - sum = sum / (crs->ng[0] + crs->ng[1] + crs->ng[2]); - for (uint i = 0; i < ln + in; i++) - x[i] -= sum; - } - - free(rhs), free(zl), free(xl); - - return 0; -} - -int schur_free(struct coarse *crs) { - struct schur *schur = (struct schur *)crs->solver; - if (schur != NULL) { - mat_free(&schur->A_ll); - par_mat_free(&schur->A_ls); - if (schur->Q_ls != NULL) - gs_free(schur->Q_ls), schur->Q_ls = NULL; - par_mat_free(&schur->A_sl); - if (schur->Q_sl != NULL) - gs_free(schur->Q_sl), schur->Q_sl = NULL; - par_mat_free(&schur->A_ss); - if (schur->Q_ss != NULL) - gs_free(schur->Q_ss), schur->Q_ss = NULL; - if (schur->M != NULL) - mg_free(schur->M), schur->M = NULL; - free(schur), schur = NULL; - } - - return 0; -} - -#undef MAX diff --git a/src/sort-bin.c b/src/sort-bin.c new file mode 100644 index 00000000..cb6c6d9e --- /dev/null +++ b/src/sort-bin.c @@ -0,0 +1,52 @@ +#include "sort-impl.h" + +static uint *set_proc_from_val(struct sort *s, uint field, + const struct comm *c) { + struct array *a = s->a; + gs_dom t = s->t[field]; + uint offset = s->offset[field]; + + double extrema[2]; + get_extrema((void *)extrema, s, field, 
c); + double range = extrema[1] - extrema[0]; + + uint size = a->n; + if (size == 0) + return NULL; + uint *proc = tcalloc(uint, size); + + uint np = c->np; + assert(np > 0); + uint id = 0, index = 0; + do { + double end = extrema[0] + (range / np) * (id + 1); + while (index < size) { + double val = get_scalar(a, index, offset, s->unit_size, t); + if (val <= end) + proc[index] = id, index++; + else + break; + } + id++; + } while (id < np && index < size); + for (; index < size; index++) + proc[index] = np - 1; + + return proc; +} + +void parallel_bin_sort(struct sort *s, const struct comm *c) { + // Locally sort the array first. + sort_local(s); + + // Set destination bin based on the field value. + uint *proc = set_proc_from_val(s, 0, c); + + // Transfer the array in chunks. + sarray_transfer_chunk(s->a, s->unit_size, proc, c); + free(proc); + + // Locally sort again to make sure that we have both globally and locally + // sorted array. + sort_local(s); +} diff --git a/src/sort-hypercube.c b/src/sort-hypercube.c new file mode 100644 index 00000000..d8f01a52 --- /dev/null +++ b/src/sort-hypercube.c @@ -0,0 +1,150 @@ +#include "sort-impl.h" +#include + +struct hypercube { + struct sort *data; + int nprobes; + double *probes; + ulong *probe_cnt; +}; + +static void init_probes(struct hypercube *data, const struct comm *c) { + // Allocate space for probes and counts. 
+ int nprobes = data->nprobes = 3; + if (!data->probes) + data->probes = tcalloc(double, nprobes); + if (!data->probe_cnt) + data->probe_cnt = tcalloc(ulong, nprobes); + + double extrema[2]; + get_extrema((void *)extrema, data->data, 0, c); + double range = extrema[1] - extrema[0]; + double delta = range / (nprobes - 1); + + data->probes[0] = extrema[0]; + data->probes[1] = extrema[0] + delta; + data->probes[2] = extrema[1]; +} + +static void update_probe_counts(struct hypercube *data, const struct comm *c) { + struct sort *input = data->data; + uint offset = input->offset[0]; + gs_dom t = input->t[0]; + + uint nprobes = data->nprobes; + for (uint i = 0; i < nprobes; i++) + data->probe_cnt[i] = 0; + + struct array *a = input->a; + for (uint e = 0; e < a->n; e++) { + double val = get_scalar(a, e, offset, input->unit_size, t); + for (uint i = 0; i < nprobes; i++) { + if (val < data->probes[i]) + data->probe_cnt[i]++; + } + } + + slong wrk[6]; + comm_allreduce(c, gs_long, gs_add, data->probe_cnt, nprobes, wrk); +} + +static void update_probes(slong nelem, double *probes, ulong *probe_cnt, + uint threshold) { + assert(nelem >= 0); + slong expected = nelem / 2; + if (llabs(expected - (slong)probe_cnt[1]) < threshold) + return; + + if (probe_cnt[1] < (ulong)expected) + probes[0] = probes[1]; + else + probes[2] = probes[1]; + + probes[1] = probes[0] + (probes[2] - probes[0]) / 2.0; +} + +static void transfer_elem(const struct hypercube *data, const struct comm *c) { + struct sort *input = data->data; + uint usize = input->unit_size; + uint offset = input->offset[0]; + gs_dom t = input->t[0]; + struct array *a = input->a; + + uint size = a->n, lown = 0, uppern = 0; + for (uint e = 0; e < size; e++) { + double val = get_scalar(a, e, offset, usize, t); + if (val < data->probes[1]) + lown++; + else + uppern++; + } + + slong out[2][2], in[2] = {lown, uppern}, wrk[2][2]; + comm_scan(out, c, gs_long, gs_add, in, 2, wrk); + slong lstart = out[0][0], ustart = out[0][1]; + slong 
lelem = out[1][0], uelem = out[1][1]; + + uint np = c->np, lnp = np / 2; + uint *proc1 = set_proc_from_idx(lnp, lstart, lown, lelem); + uint *proc2 = set_proc_from_idx(np - lnp, ustart, uppern, uelem); + proc1 = trealloc(uint, proc1, size); + for (uint e = lown; e < size; e++) + proc1[e] = proc2[e - lown] + lnp; + + sarray_transfer_chunk(a, usize, proc1, c); + free(proc1), free(proc2); +} + +// TODO: Get rid of this recursive implementation. +static void parallel_hypercube_sort_aux(struct hypercube *data, + const struct comm *c) { + struct sort *input = data->data; + struct array *a = input->a; + + // FIXME: Replace comm_scan() by comm_allreduce(). + slong out[2][1], buf[2][1], in = a->n; + comm_scan(out, c, gs_long, gs_add, &in, 1, buf); + slong nelem = out[1][0]; + + uint threshold = nelem / (10 * c->np); + if (threshold < 2) + threshold = 2; + + sort_local(data->data); + + if (c->np == 1) + return; + + init_probes(data, c); + update_probe_counts(data, c); + int max_iter = log2((data->probes[2] - data->probes[0]) / 1e-12); + int iter = 0; + while (llabs(nelem / 2 - (slong)data->probe_cnt[1]) > threshold && + iter++ < max_iter) { + update_probes(nelem, data->probes, data->probe_cnt, threshold); + update_probe_counts(data, c); + } + + transfer_elem(data, c); + + // split the communicator + struct comm nc; + sint lower = (c->id < c->np / 2); + comm_split(c, lower, c->id, &nc); + + // TODO: Keep load balancing after each split + parallel_hypercube_sort_aux(data, &nc); + + comm_free(&nc); +} + +void parallel_hypercube_sort(struct sort *sd, const struct comm *c) { + struct comm dup; + comm_dup(&dup, c); + + struct hypercube hdata = {.data = sd, .probes = NULL, .probe_cnt = NULL}; + parallel_hypercube_sort_aux(&hdata, &dup); + free(hdata.probes), free(hdata.probe_cnt); + + comm_free(&dup); +} diff --git a/src/sort-impl.h b/src/sort-impl.h new file mode 100644 index 00000000..47a52100 --- /dev/null +++ b/src/sort-impl.h @@ -0,0 +1,34 @@ +#ifndef _PARRSB_SORT_IMPL_H_ 
+#define _PARRSB_SORT_IMPL_H_ + +#include "sort.h" + +double get_scalar(struct array *a, uint i, uint offset, uint usize, + gs_dom type); + +uint *set_proc_from_idx(uint size, sint np, slong start, slong nelem); + +void sarray_transfer_chunk(struct array *arr, const size_t usize, + const uint *proc, const struct comm *c); + +struct sort { + struct array *a; + size_t unit_size, align; + + int nfields; + gs_dom t[3]; + uint offset[3]; + + buffer *buf; +}; + +void sort_local(struct sort *s); + +void get_extrema(void *extrema_, struct sort *data, uint field, + const struct comm *c); + +void parallel_hypercube_sort(struct sort *s, const struct comm *c); + +void parallel_bin_sort(struct sort *s, const struct comm *c); + +#endif // _PARRSB_SORT_IMPL_H_ diff --git a/src/sort.c b/src/sort.c index 5de28e61..93b0adb5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -1,9 +1,12 @@ -#include "sort.h" +#include "sort-impl.h" #include #include -static double get_scalar(struct array *a, uint i, uint offset, uint usize, - gs_dom type) { +extern void parrsb_print(const struct comm *c, int verbose, const char *fmt, + ...); + +double get_scalar(struct array *a, uint i, uint offset, uint usize, + gs_dom type) { char *v = (char *)a->ptr + i * usize + offset; double data; @@ -18,14 +21,16 @@ static double get_scalar(struct array *a, uint i, uint offset, uint usize, data = *((double *)v); break; default: + fprintf(stderr, "Error: Unknown type %d\n", type); + exit(EXIT_FAILURE); break; } return data; } -static void get_extrema(void *extrema_, struct sort *data, uint field, - const struct comm *c) { +void get_extrema(void *extrema_, struct sort *data, uint field, + const struct comm *c) { struct array *a = data->a; uint usize = data->unit_size; uint offset = data->offset[field]; @@ -41,70 +46,35 @@ static void get_extrema(void *extrema_, struct sort *data, uint field, extrema[1] = get_scalar(a, size - 1, offset, usize, t); } - double buf[2]; + double buf[4]; comm_allreduce(c, gs_double, gs_max, 
extrema, 2, buf); extrema[0] *= -1; } -static int set_dest(uint *proc, uint size, sint np, slong start, slong nelem) { +uint *set_proc_from_idx(uint size, sint np_, slong start, slong nelem) { if (nelem == 0) - return 1; + return NULL; + uint *proc = tcalloc(uint, size + 1); - uint nelt = nelem / np, nrem = nelem - np * nelt; + ulong np = np_; + ulong nelt = nelem / np, nrem = nelem - np * nelt; + assert(nrem < np); if (nrem == 0) { - for (uint i = 0; i < size; i++) { - proc[i] = (start + i) / nelt; - } + for (uint i = 0; i < size; i++) + proc[i] = (uint)((start + i) / nelt); } else { - uint s = np - nrem; - slong t = nelt * s; + ulong s = np - nrem; + ulong t1 = nelt * s; for (uint i = 0; i < size; i++) { - if (start + i < t) - proc[i] = (start + i) / nelt; + ulong spi = start + i; + if (spi < t1) + proc[i] = (uint)(spi / nelt); else - proc[i] = s + (start + i - t) / (nelt + 1); + proc[i] = (uint)s + (uint)((spi - t1) / (nelt + 1)); } } - return 0; -} - -//----------------------------------------------------------------------------- -// Parallel Bin-Sort -// -static int set_bin(uint **proc_, struct sort *s, uint field, - const struct comm *c) { - struct array *a = s->a; - gs_dom t = s->t[field]; - uint offset = s->offset[field]; - - uint size = a->n; - uint *proc = *proc_ = tcalloc(uint, size); - - double extrema[2]; - get_extrema((void *)extrema, s, field, c); - double range = extrema[1] - extrema[0]; - - if (size == 0) - return 0; - - sint np = c->np; - uint id = 0; - uint index = 0; - do { - double end = extrema[0] + (range / np) * (id + 1); - while (index < size) { - double val = get_scalar(a, index, offset, s->unit_size, t); - if (val <= end) - proc[index++] = id; - else - break; - } - id++; - } while (id < np && index < size); - for (; index < size; index++) - proc[index] = np - 1; - return 0; + return proc; } static int sort_field(struct array *arr, size_t usize, gs_dom t, uint off, @@ -131,7 +101,7 @@ static int sort_field(struct array *arr, size_t usize, 
gs_dom t, uint off, return 0; } -int sort_local(struct sort *s) { +void sort_local(struct sort *s) { struct array *a = s->a; buffer *buf = s->buf; size_t usize = s->unit_size; @@ -141,247 +111,120 @@ int sort_local(struct sort *s) { while (i >= 0) sort_field(a, usize, s->t[i], s->offset[i], buf, 1), i--; sarray_permute_buf_(s->align, usize, a->ptr, a->n, buf); - - return 0; } -static int parallel_bin_sort(struct sort *s, const struct comm *c) { - // Local sort - sort_local(s); - - // Set destination bin - uint *proc; - set_bin(&proc, s, 0, c); - - // Transfer to destination processor - struct crystal cr; - crystal_init(&cr, c); - sarray_transfer_ext_(s->a, s->unit_size, proc, sizeof(uint), &cr); - crystal_free(&cr); +static int load_balance(struct array *a, size_t size, const struct comm *c) { + slong out[2][1], wrk[2][1], in = a->n; + comm_scan(out, c, gs_long, gs_add, &in, 1, wrk); + slong start = out[0][0], nelem = out[1][0]; + parrsb_print(c, 0, "\t\t\tstart = %lld, nelem = %lld", start, nelem); + uint *proc = set_proc_from_idx(a->n, c->np, start, nelem); + sarray_transfer_chunk(a, size, proc, c); free(proc); - // Locally sort again - sort_local(s); - - return 0; -} - -//----------------------------------------------------------------------------- -// Parallel Hypercube-Sort -// -struct hypercube { - struct sort *data; - int nprobes; - double *probes; - ulong *probe_cnt; -}; - -static int init_probes(struct hypercube *data, struct comm *c) { - struct sort *input = data->data; - - // Allocate space for probes and counts - int nprobes = data->nprobes = 3; - if (!data->probes) - data->probes = tcalloc(double, nprobes); - if (!data->probe_cnt) - data->probe_cnt = tcalloc(ulong, nprobes); - - double extrema[2]; - get_extrema((void *)extrema, data->data, 0, c); - double range = extrema[1] - extrema[0]; - double delta = range / (nprobes - 1); - - data->probes[0] = extrema[0]; - data->probes[1] = extrema[0] + delta; - data->probes[2] = extrema[1]; - - return 0; -} - 
-static int update_probe_counts(struct hypercube *data, struct comm *c) { - struct sort *input = data->data; - uint offset = input->offset[0]; - gs_dom t = input->t[0]; - - uint nprobes = data->nprobes; - uint i; - for (i = 0; i < nprobes; i++) - data->probe_cnt[i] = 0; - - struct array *a = input->a; - uint e; - for (e = 0; e < a->n; e++) { - double val_e = get_scalar(a, e, offset, input->unit_size, t); - for (i = 0; i < nprobes; i++) - if (val_e < data->probes[i]) - data->probe_cnt[i]++; - } - - ulong buf[3]; - comm_allreduce(c, gs_long, gs_add, data->probe_cnt, nprobes, buf); - - return 0; -} - -static int update_probes(slong nelem, double *probes, ulong *probe_cnt, - uint threshold) { - slong expected = nelem / 2; - if (llabs(expected - (slong)probe_cnt[1]) < threshold) - return 0; - - if (probe_cnt[1] < expected) - probes[0] = probes[1]; - else - probes[2] = probes[1]; - - probes[1] = probes[0] + (probes[2] - probes[0]) / 2.0; - return 0; } -static int transfer_elem(struct hypercube *data, struct comm *c) { - struct sort *input = data->data; - uint usize = input->unit_size, offset = input->offset[0]; - gs_dom t = input->t[0]; - struct array *a = input->a; - - uint size = a->n, lown = 0, uppern = 0; - for (uint e = 0; e < size; e++) { - double val = get_scalar(a, e, offset, usize, t); - if (val < data->probes[1]) - lown++; - else - uppern++; - } - - slong out[2][2], in[2] = {lown, uppern}, buf[2][2]; - comm_scan(out, c, gs_long, gs_add, in, 2, buf); - slong lstart = out[0][0], ustart = out[0][1]; - slong lelem = out[1][0], uelem = out[1][1]; - - uint np = c->np, lnp = np / 2; - uint *proc = tcalloc(uint, size); - set_dest(proc, lnp, lstart, lown, lelem); - set_dest(proc + lown, np - lnp, ustart, uppern, uelem); - - for (uint e = lown; e < size; e++) - proc[e] += lnp; +void sarray_transfer_chunk(struct array *arr, const size_t usize, + const uint *proci, const struct comm *c) { + // Calculate the global array size. If it is zero, nothing to do, just return. 
+ slong ng = arr->n, wrk[2]; + comm_allreduce(c, gs_long, gs_add, &ng, 1, wrk); + if (ng == 0) + return; + // Initialize the crystal router. struct crystal cr; crystal_init(&cr, c); - sarray_transfer_ext_(a, usize, proc, sizeof(uint), &cr); - crystal_free(&cr); - - free(proc); - - return 0; -} - -static int parallel_hypercube_sort(struct hypercube *data, struct comm *c) { - struct sort *input = data->data; - struct array *a = input->a; - gs_dom t = input->t[0]; - uint offset = input->offset[0]; - - sint size = c->np, rank = c->id; - - slong out[2][1], buf[2][1], in = a->n; - comm_scan(out, c, gs_long, gs_add, &in, 1, buf); - slong start = out[0][0]; - slong nelem = out[1][0]; - - uint threshold = nelem / (10 * size); - if (threshold < 2) - threshold = 2; - sort_local(data->data); - - if (size == 1) - return 0; - - init_probes(data, c); - update_probe_counts(data, c); - - int max_iter = log2((data->probes[2] - data->probes[0]) / 1e-12); - int iter = 0; - while (llabs(nelem / 2 - (slong)data->probe_cnt[1]) > threshold && - iter++ < max_iter) { - update_probes(nelem, data->probes, data->probe_cnt, threshold); - update_probe_counts(data, c); + // Allocate `proc` with some buffer space. + uint *proc = tcalloc(uint, arr->n + 1); + for (uint i = 0; i < arr->n; i++) + proc[i] = proci[i]; + + // Transfer the array elements to destination processor. To avoid message + // sizes larger than INT_MAX, we calculate total message size and then figure + // out how many transfers we need. Then we transfer array using that many + // transfers. 
+ slong msg_size = 9 * (INT_MAX / 10); + uint nt = (ng * usize + msg_size - 1) / msg_size; + parrsb_print(c, 0, "\t\t\tmsg_size = %lld, nt = %u", msg_size, nt); + uint tsize = (arr->n + nt - 1) / nt; + + struct array brr, crr; + array_init_(&brr, tsize + 1, usize, __FILE__, __LINE__); + array_init_(&crr, arr->n + 1, usize, __FILE__, __LINE__); + + char *pe = (char *)arr->ptr; + uint off = 0, off1; + for (unsigned t = 0; t < nt; t++) { + // Copy a chunk from `arr` to `brr`. + brr.n = 0, off1 = off + tsize; + assert(off <= arr->n); + for (uint j = off; j < arr->n && j < off1; j++) + array_cat_(usize, &brr, &pe[j * usize], 1, __FILE__, __LINE__); + + // Transfer the chunk in `brr` to the destination. + sarray_transfer_ext_(&brr, usize, &proc[off], sizeof(uint), &cr); + + // Append the received chunk to `crr`. + array_cat_(usize, &crr, brr.ptr, brr.n, __FILE__, __LINE__); + off = (off1 < arr->n ? off1 : arr->n); + + // Some debug printing. + slong cmax = crr.n, bmax = brr.n, cmin = crr.n, bmin = brr.n; + comm_allreduce(c, gs_long, gs_max, &cmax, 1, wrk); + comm_allreduce(c, gs_long, gs_max, &bmax, 1, wrk); + comm_allreduce(c, gs_long, gs_min, &cmin, 1, wrk); + comm_allreduce(c, gs_long, gs_min, &bmin, 1, wrk); + parrsb_print(c, 0, "\t\t\t %d/%d brr.n = %u/%lld/%lld crr.n = %u/%lld/%lld", + t, nt, brr.n, bmin, bmax, crr.n, cmin, cmax); } + array_free(&brr); - transfer_elem(data, c); - - // split the communicator - struct comm nc; - sint lower = (rank < size / 2) ? 
1 : 0; -#if defined(MPI) - MPI_Comm nc_; - MPI_Comm_split(c->c, lower, rank, &nc_); - comm_init(&nc, nc_); - MPI_Comm_free(&nc_); -#else - comm_init(&nc, 1); -#endif + arr->n = 0; + array_cat_(usize, arr, crr.ptr, crr.n, __FILE__, __LINE__); + array_free(&crr); - // TODO: Keep load balancing after each split - parallel_hypercube_sort(data, &nc); - comm_free(&nc); - - return 0; -} - -static int load_balance(struct array *a, size_t size, const struct comm *c, - struct crystal *cr) { - slong out[2][1], buf[2][1], in = a->n; - comm_scan(out, c, gs_long, gs_add, &in, 1, buf); - slong start = out[0][0], nelem = out[1][0]; - - uint *proc = tcalloc(uint, a->n); - set_dest(proc, a->n, c->np, start, nelem); - sarray_transfer_ext_(a, size, proc, sizeof(uint), cr); free(proc); - - return 0; + crystal_free(&cr); } -int parallel_sort_private(struct sort *data, const struct comm *c) { - struct comm dup; - comm_dup(&dup, c); - - int balance = data->balance, algo = data->algo; - - struct array *a = data->a; - size_t usize = data->unit_size; +void parallel_sort_(struct array *arr, size_t usize, size_t align, + unsigned algo, unsigned balance, const struct comm *c, + buffer *bfr, unsigned nfields, ...) { + struct sort sd = {.a = arr, .unit_size = usize, .align = align}; + sd.buf = bfr; + sd.nfields = nfields; + + va_list vargs; + va_start(vargs, nfields); + for (uint i = 0; i < nfields; i++) { + sd.t[i] = va_arg(vargs, gs_dom); + sd.offset[i] = va_arg(vargs, size_t); + } + va_end(vargs); - struct hypercube hdata; + // If there is only a single MPI process, just sort locally and return. 
+  if (c->np == 1) {
+    sort_local(&sd);
+    return;
+  }
 
   switch (algo) {
   case 0:
-    parallel_bin_sort(data, c);
+    parallel_bin_sort(&sd, c);
     break;
   case 1:
-    hdata.data = data;
-    hdata.probes = NULL;
-    hdata.probe_cnt = NULL;
-    parallel_hypercube_sort(&hdata, &dup);
-    free(hdata.probes);
-    free(hdata.probe_cnt);
+    parallel_hypercube_sort(&sd, c);
     break;
   default:
     break;
   }
 
   if (balance) {
-    struct crystal cr;
-    crystal_init(&cr, c);
-    load_balance(a, usize, c, &cr);
-    crystal_free(&cr);
-    sort_local(data);
+    load_balance(sd.a, sd.unit_size, c);
+    sort_local(&sd);
   }
-
-  comm_free(&dup);
-
-  return 0;
 }
diff --git a/src/sort.h b/src/sort.h
index b2e83cd2..5e8f150f 100644
--- a/src/sort.h
+++ b/src/sort.h
@@ -2,54 +2,18 @@
 #define _PARRSB_SORT_H_
 
 #include
+#include
 
-struct sort {
-  int balance, algo;
+void parallel_sort_(struct array *arr, size_t usize, size_t align,
+                    unsigned algo, unsigned balance, const struct comm *c,
+                    buffer *bfr, unsigned nfields, ...);
 
-  int nfields;
-  gs_dom t[3];
-  uint offset[3];
+#define parallel_sort(T, A, field, type, algo, balance, c, bfr)                \
+  parallel_sort_(A, sizeof(T), ALIGNOF(T), algo, balance, c, bfr, 1, type,     \
+                 offsetof(T, field))
 
-  struct array *a;
-  size_t unit_size, align;
-
-  buffer *buf;
-};
-
-int sort_local(struct sort *s);
-int parallel_sort_private(struct sort *s, const struct comm *c);
-
-// Uniform parallel sort
-#define parallel_sort(T, A, field, type, method, loadbalance, c, bufp)         \
-  do {                                                                         \
-    struct sort sd;                                                            \
-    sd.unit_size = sizeof(T);                                                  \
-    sd.align = ALIGNOF(T);                                                     \
-    sd.nfields = 1;                                                            \
-    sd.t[0] = type;                                                            \
-    sd.offset[0] = offsetof(T, field);                                         \
-    sd.a = A;                                                                  \
-    sd.algo = method;                                                          \
-    sd.balance = loadbalance;                                                  \
-    sd.buf = bufp;                                                             \
-    parallel_sort_private(&sd, c);                                             \
-  } while (0)
-
-#define parallel_sort_2(T, A, f1, t1, f2, t2, method, loadbalance, c, bufp)    \
-  do {                                                                         \
-    struct sort sd;                                                            \
-    sd.unit_size = sizeof(T);                                                  \
-    sd.align = ALIGNOF(T);                                                     \
-    sd.nfields = 2;                                                            \
-    sd.t[0] = t1;                                                              \
-    sd.offset[0] = offsetof(T, f1);                                            \
-    sd.t[1] = t2;                                                              \
-    sd.offset[1] = offsetof(T, f2);                                            \
-    sd.a = A;                                                                  \
-    sd.algo = method;                                                          \
-    sd.balance = loadbalance;                                                  \
-    sd.buf = bufp;                                                             \
-    parallel_sort_private(&sd, c);                                             \
-  } while (0)
+#define parallel_sort_2(T, A, f1, t1, f2, t2, algo, balance, c, bfr)           \
+  parallel_sort_(A, sizeof(T), ALIGNOF(T), algo, balance, c, bfr, 2, t1,       \
+                 offsetof(T, f1), t2, offsetof(T, f2))
 
 #endif
diff --git a/src/statistics.c b/src/statistics.c
new file mode 100644
index 00000000..f8f8697a
--- /dev/null
+++ b/src/statistics.c
@@ -0,0 +1,274 @@
+#include "parrsb-impl.h"
+
+#include
+#include
+
+// Return the id of the partition the calling rank belongs to. A partition is
+// a group of processors sharing the same local communicator `lc`. The id is
+// read from a sum-scan over `gc` of a flag raised only on local roots, and is
+// then shared with the other ranks of `lc` through a max-reduction.
+static uint get_partition(const struct comm *const gc,
+                          const struct comm *const lc) {
+  sint out[2][1], wrk[2][1], root = (lc->id == 0);
+  comm_scan(out, gc, gs_int, gs_add, &root, 1, wrk);
+  // Non-root ranks contribute 0, so the max below picks the root's value.
+  sint part = out[0][0] * (lc->id == 0);
+  comm_allreduce(lc, gs_int, gs_max, &part, 1, wrk);
+  return part;
+}
+
+// Count the partitions that share at least one vertex with the partition of
+// the calling rank; the partition itself is not counted.
+//
+// elems: array of `struct rsb_element` held by this rank.
+// nv:    number of vertex ids read from each element.
+// gc:    global communicator used to match vertex ids across all ranks.
+// lc:    local communicator shared by the ranks of this partition.
+// bfr:   scratch buffer passed to the local sorts.
+//
+// The count is summed over `lc`, so all ranks of a partition return the same
+// value. Asserts that the partition contributed at least one vertex.
+uint parrsb_get_neighbors(const struct array *const elems, const unsigned nv,
+                          const struct comm *const gc,
+                          const struct comm *const lc, buffer *bfr) {
+  const uint n = elems->n;
+  const uint size = elems->n * nv;
+
+  struct vertex_t {
+    ulong v;
+    uint p, partition;
+  };
+
+  struct array vertices;
+  array_init(struct vertex_t, &vertices, size);
+
+  // Tag every local vertex with this partition and address it to rank
+  // `v % gc->np`, so that all copies of a vertex id meet on one rank.
+  const struct rsb_element *const pe =
+      (const struct rsb_element *const)elems->ptr;
+  struct vertex_t vt = {.partition = get_partition(gc, lc)};
+  for (uint i = 0; i < n; i++) {
+    for (uint v = 0; v < nv; v++) {
+      vt.v = pe[i].vertices[v], vt.p = vt.v % gc->np;
+      array_cat(struct vertex_t, &vertices, &vt, 1);
+    }
+  }
+
+  struct crystal cr;
+  crystal_init(&cr, gc);
+
+  sarray_transfer(struct vertex_t, &vertices, p, 1, &cr);
+  sarray_sort(struct vertex_t, vertices.ptr, vertices.n, v, 1, bfr);
+
+  struct array neighbors;
+  array_init(struct vertex_t, &neighbors, vertices.n * 27);
+
+  // For each run [s, e) of equal vertex ids, emit one copy of every entry per
+  // partition found in the run (its own partition included).
+  const struct vertex_t *const pv = (const struct vertex_t *const)vertices.ptr;
+  uint s = 0;
+  while (s < vertices.n) {
+    uint e = s + 1;
+    while (e < vertices.n && pv[s].v == pv[e].v)
+      e++;
+    for (uint i = s; i < e; i++) {
+      struct vertex_t vt = pv[i];
+      for (uint j = s; j < e; j++) {
+        vt.partition = pv[j].partition;
+        array_cat(struct vertex_t, &neighbors, &vt, 1);
+      }
+    }
+    s = e;
+  }
+  array_free(&vertices);
+
+  // NOTE(review): presumably sends each copy back to the rank it came from;
+  // this relies on the transfer above having rewritten `p` -- confirm.
+  sarray_transfer(struct vertex_t, &neighbors, p, 0, &cr);
+  crystal_free(&cr);
+  sarray_sort(struct vertex_t, neighbors.ptr, neighbors.n, partition, 0, bfr);
+
+  // Now, extract out different partition ids found locally into an array.
+  struct unique_t {
+    uint p, partition;
+  };
+
+  struct array unique;
+  array_init(struct unique_t, &unique, 27);
+
+  if (neighbors.n > 0) {
+    const struct vertex_t *const pn =
+        (const struct vertex_t *const)neighbors.ptr;
+    struct unique_t ut = {.partition = pn[0].partition,
+                          .p = pn[0].partition % lc->np};
+    array_cat(struct unique_t, &unique, &ut, 1);
+    for (uint i = 1; i < neighbors.n; i++) {
+      if (pn[i].partition > pn[i - 1].partition) {
+        ut.partition = pn[i].partition, ut.p = ut.partition % lc->np;
+        array_cat(struct unique_t, &unique, &ut, 1);
+      }
+    }
+  }
+  array_free(&neighbors);
+
+  // Address each id to rank `partition % lc->np` of `lc` so that duplicates
+  // seen by different ranks can be dropped before counting.
+  crystal_init(&cr, lc);
+  sarray_transfer(struct unique_t, &unique, p, 0, &cr);
+  crystal_free(&cr);
+
+  sarray_sort(struct unique_t, unique.ptr, unique.n, partition, 0, bfr);
+  sint un = 0;
+  if (unique.n > 0) {
+    un = 1;
+    struct unique_t *pu = (struct unique_t *)unique.ptr;
+    for (uint i = 1; i < unique.n; i++) {
+      if (pu[i].partition > pu[un - 1].partition)
+        pu[un] = pu[i], un++;
+    }
+  }
+  array_free(&unique);
+
+  sint wrk;
+  comm_allreduce(lc, gs_int, gs_add, &un, 1, &wrk);
+  assert(un >= 1);
+
+  // The partition always sees itself, hence the -1.
+  return un - 1;
+}
+
+// State of the partition geometry dump. It is set up by
+// parrsb_dump_stats_start() and released by parrsb_dump_stats_end().
+static struct array pgeom;
+static buffer bfr;
+static uint pgeom_initialized = 0;
+static uint nv = 0;
+static uint level = 0;
+
+// Geometry record of one partition at one level. `p` is the rank the record
+// is addressed to when the records are gathered.
+struct pgeom_t {
+  uint partition, level;
+  double centroid[3], min[3], max[3];
+  uint p;
+};
+
+// Start collecting partition geometry. `nv_` is the number of vertices per
+// element; 8 selects 3D, any other value 2D. Does nothing if a collection is
+// already in progress.
+void parrsb_dump_stats_start(const uint nv_) {
+  if (pgeom_initialized)
+    return;
+
+  nv = nv_;
+  level = 0;
+  array_init(struct pgeom_t, &pgeom, 1024);
+  buffer_init(&bfr, 1024);
+
+  pgeom_initialized = 1;
+}
+
+// Record the centroid and the bounding box of the partition sharing `lc`.
+// Every call bumps the level counter, and the local root of `lc` stores one
+// record. Does nothing unless parrsb_dump_stats_start() was called.
+//
+// gc:    global communicator, used to find the partition id.
+// lc:    local communicator of the partition being measured.
+// elems: array of `struct rsb_element` held by this rank.
+// bfr:   not used here; it shadows the file-level buffer of the same name.
+void parrsb_dump_stats(const struct comm *const gc, const struct comm *const lc,
+                       const struct array *const elems, buffer *bfr) {
+  if (!pgeom_initialized)
+    return;
+
+  const struct rsb_element *const pe =
+      (const struct rsb_element *const)elems->ptr;
+
+  // Accumulate the local coordinate sums and the local bounding box.
+  double centroid[3] = {0.0, 0.0, 0.0};
+  double max[3] = {-DBL_MAX, -DBL_MAX, -DBL_MAX};
+  double min[3] = {DBL_MAX, DBL_MAX, DBL_MAX};
+  const uint n = elems->n;
+  const unsigned ndim = (nv == 8) ? 3 : 2;
+  for (uint e = 0; e < n; e++) {
+    for (uint d = 0; d < ndim; d++) {
+      double c = pe[e].coord[d];
+      centroid[d] += c;
+      max[d] = (max[d] < c) ? c : max[d];
+      min[d] = (min[d] > c) ? c : min[d];
+    }
+  }
+
+  // Reduce the sums together with the element count and divide only once.
+  // Averaging per-rank means would give NaN on a rank that holds no element
+  // (0.0 / 0) and a skewed centroid when ranks hold unequal element counts.
+  double wrk[3], count = n;
+  comm_allreduce(lc, gs_double, gs_min, min, ndim, wrk);
+  comm_allreduce(lc, gs_double, gs_max, max, ndim, wrk);
+  comm_allreduce(lc, gs_double, gs_add, centroid, ndim, wrk);
+  comm_allreduce(lc, gs_double, gs_add, &count, 1, wrk);
+  for (uint d = 0; d < ndim && count > 0; d++)
+    centroid[d] /= count;
+
+  // Partition root accumulates the partition geometry.
+  level++;
+  struct pgeom_t pg = {.partition = get_partition(gc, lc),
+                       .level = level,
+                       .centroid = {centroid[0], centroid[1], centroid[2]},
+                       .max = {max[0], max[1], max[2]},
+                       .min = {min[0], min[1], min[2]},
+                       .p = 0};
+  if (lc->id == 0)
+    array_cat(struct pgeom_t, &pgeom, &pg, 1);
+}
+
+// Gather all records on rank 0 of `gc`, write them to
+// "<prefix>_partition_geom_p<np>.txt" sorted by level and partition id, and
+// release the collection state. `prefix` must be shorter than 64 characters.
+// Does nothing unless parrsb_dump_stats_start() was called.
+void parrsb_dump_stats_end(const struct comm *const gc, const char *prefix) {
+  if (!pgeom_initialized)
+    return;
+
+  const uint size = strnlen(prefix, 64);
+  assert(size < 64 && "Prefix must be less than 64 characters.");
+
+  // Send all the data to global root.
+  struct crystal cr;
+  crystal_init(&cr, gc);
+  sarray_transfer(struct pgeom_t, &pgeom, p, 0, &cr);
+  crystal_free(&cr);
+
+  // Sort by level first, then by partition id.
+  sarray_sort_2(struct pgeom_t, pgeom.ptr, pgeom.n, level, 0, partition, 0,
+                &bfr);
+
+  if (gc->id == 0) {
+    // The buffer must be writable: snprintf() stores the file name in it.
+    char name[BUFSIZ];
+    snprintf(name, BUFSIZ, "%s_partition_geom_p%06d.txt", prefix, gc->np);
+
+    FILE *fp = fopen(name, "w");
+    if (!fp) {
+      fprintf(stderr, "Failed to open %s for writing.\n", name);
+      exit(EXIT_FAILURE);
+    }
+
+    fprintf(fp, "%zu\n", pgeom.n);
+    fprintf(fp, "level partition centroid[0] centroid[1] centroid[2] min[0] "
+                "min[1] min[2] max[0] max[1] max[2]\n");
+    const struct pgeom_t *const pg = (const struct pgeom_t *const)pgeom.ptr;
+    for (uint i = 0; i < pgeom.n; i++) {
+      fprintf(fp, "%u %u %lf %lf %lf %lf %lf %lf %lf %lf %lf\n", pg[i].level,
+              pg[i].partition, pg[i].centroid[0], pg[i].centroid[1],
+              pg[i].centroid[2], pg[i].min[0], pg[i].min[1], pg[i].min[2],
+              pg[i].max[0], pg[i].max[1], pg[i].max[2]);
+    }
+    fclose(fp);
+  }
+
+  array_free(&pgeom);
+  buffer_free(&bfr);
+
+  pgeom_initialized = nv = level = 0;
+}