cuSolverRf.cpp

/*
 * Copyright 2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 *  A framework of refactorization process.
 *  
 *  step 1: compute P*A*Q = L*U by
 *    - reordering and
 *    - LU with partial pivoting in cusolverSp
 *
 *  step 2: set up cusolverRf by (P, Q, L, U)
 *
 *  step 3: analyze and refactor A
 *
 *  How to use
 *     ./cuSolverRf -P=symrcm -file <file>
 *     ./cuSolverRf -P=symamd -file <file>
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "cusolverSp.h"
#include "cusolverRf.h"

#include "helper_string.h"
#include "helper_cusolver.h"

#include "cusolverSp_LOWLEVEL_PREVIEW.h"

#include <cuda_runtime.h>
#include "helper_cuda.h"


template <typename T_ELEM>
int loadMMSparseMatrix(
    char *filename,
    char elem_type,
    bool csrFormat,
    int *m,
    int *n,
    int *nnz,
    T_ELEM **aVal,
    int **aRowInd,
    int **aColInd,
    int extendSymMatrix);

void UsageRF(void)
{
    printf( "<options>\n");
    printf( "-h          : display this help\n");
    printf( "-P=<name>    : choose a reordering\n");
    printf( "              symrcm (Reverse Cuthill-McKee)\n");
    printf( "              symamd (Approximate Minimum Degree)\n");
    printf( "-file=<filename> : filename containing a matrix in MM format\n");
    printf( "-device=<device_id> : <device_id> if want to run on specific GPU\n");

    exit( 0 );
}

void parseCommandLineArguments(int argc, char *argv[], struct testOpts &opts)
{
    memset(&opts, 0, sizeof(opts));

    if (checkCmdLineFlag(argc, (const char **)argv, "-h"))
    {
        UsageRF();
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "P"))
    {
        char *reorderType = NULL;
        getCmdLineArgumentString(argc, (const char **)argv, "P", &reorderType);

        if (reorderType)
        {
            if ((STRCASECMP(reorderType, "symrcm") != 0) && (STRCASECMP(reorderType, "symamd") != 0))
            {
                printf("\nIncorrect argument passed to -P option\n");
                UsageRF();
            }
            else
            {
                opts.reorder = reorderType;
            }
        }
    }

    if (!opts.reorder)
    {
        opts.reorder = "symrcm"; // Setting default reordering to be symrcm.
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "file"))
    {
        char *fileName = 0;
        getCmdLineArgumentString(argc, (const char **)argv, "file", &fileName);

        if (fileName)
        {
            opts.sparse_mat_filename = fileName;
        }
        else
        {
            printf("\nIncorrect filename passed to -file \n ");
            UsageRF();
        }
    }
}

int main (int argc, char *argv[])
{
    struct testOpts opts;
    cusolverRfHandle_t cusolverRfH = NULL; // refactorization
    cusolverSpHandle_t cusolverSpH = NULL; // reordering, permutation and 1st LU factorization
    cusparseHandle_t   cusparseH = NULL;   // residual evaluation
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix

    csrluInfoHost_t info = NULL; // opaque info structure for LU with parital pivoting

    int rowsA = 0; // number of rows of A
    int colsA = 0; // number of columns of A
    int nnzA  = 0; // number of nonzeros of A
    int baseA = 0; // base index in CSR format
                   // cusolverRf only works for base-0

// cusolverRf only works for square matrix, 
// assume n = rowsA = colsA

    // CSR(A) from I/O
    int *h_csrRowPtrA = NULL; // <int> n+1 
    int *h_csrColIndA = NULL; // <int> nnzA 
    double *h_csrValA = NULL; // <double> nnzA 

    int *h_Qreorder = NULL; // <int> n
                            // reorder to reduce zero fill-in
                            // Qreorder = symrcm(A) or Qreroder = symamd(A)
    // B = Q*A*Q^T
    int *h_csrRowPtrB = NULL; // <int> n+1
    int *h_csrColIndB = NULL; // <int> nnzA
    double *h_csrValB = NULL; // <double> nnzA
    int *h_mapBfromA = NULL;  // <int> nnzA

    double *h_x = NULL; // <double> n,  x = A \ b
    double *h_b = NULL; // <double> n, b = ones(m,1)
    double *h_r = NULL; // <double> n, r = b - A*x

    // solve B*(Qx) = Q*b
    double *h_xhat = NULL; // <double> n, Q*x_hat = x
    double *h_bhat = NULL; // <double> n, b_hat = Q*b 

    size_t size_perm = 0;
    size_t size_internal = 0; 
    size_t size_lu  = 0; // size of working space for csrlu
    void *buffer_cpu = NULL; // working space for
                             // - permutation: B = Q*A*Q^T
                             // - LU with partial pivoting in cusolverSp

    // cusolverSp computes LU with partial pivoting
    //     Plu*B*Qlu^T = L*U
    //   where B = Q*A*Q^T
    //
    // nnzL and nnzU are not known until factorization is done.
    // However upper bound of L+U is known after symbolic analysis of LU.
    int *h_Plu = NULL; // <int> n
    int *h_Qlu = NULL; // <int> n

    int nnzL = 0;
    int *h_csrRowPtrL = NULL; // <int> n+1
    int *h_csrColIndL = NULL; // <int> nnzL
    double *h_csrValL = NULL; // <double> nnzL

    int nnzU = 0;
    int *h_csrRowPtrU = NULL; // <int> n+1
    int *h_csrColIndU = NULL; // <int> nnzU
    double *h_csrValU = NULL; // <double> nnzU

    int *h_P = NULL; // <int> n, P = Plu * Qreorder
    int *h_Q = NULL; // <int> n, Q = Qlu * Qreorder

    int *d_csrRowPtrA = NULL; // <int> n+1
    int *d_csrColIndA = NULL; // <int> nnzA
    double *d_csrValA = NULL; // <double> nnzA
    double *d_x = NULL; // <double> n, x = A \ b 
    double *d_b = NULL; // <double> n, a copy of h_b
    double *d_r = NULL; // <double> n, r = b - A*x

    int *d_P = NULL; // <int> n, P*A*Q^T = L*U
    int *d_Q = NULL; // <int> n 
  
    double *d_T = NULL; // working space in cusolverRfSolve
                        // |d_T| = n * nrhs  

    // the constants used in residual evaluation, r = b - A*x
    const double minus_one = -1.0;
    const double one = 1.0;
    // the constants used in cusolverRf
    // nzero is the value below which zero pivot is flagged.
    // nboost is the value which is substitured for zero pivot.
    double nzero = 0.0;
    double nboost= 0.0;
    // the constant used in cusolverSp
    // singularity is -1 if A is invertible under tol
    // tol determines the condition of singularity
    // pivot_threshold decides pivoting strategy            
    int singularity = 0; 
    const double tol = 1.e-14;
    const double pivot_threshold = 1.0;
    // the constants used in cusolverRf
    const cusolverRfFactorization_t fact_alg = CUSOLVERRF_FACTORIZATION_ALG0; // default
    const cusolverRfTriangularSolve_t solve_alg = CUSOLVERRF_TRIANGULAR_SOLVE_ALG1; // default

    double x_inf = 0.0; // |x|
    double r_inf = 0.0; // |r|
    double A_inf = 0.0; // |A|
    int errors = 0;

    double start, stop;
    double time_reorder;
    double time_perm;
    double time_sp_analysis;
    double time_sp_factor;
    double time_sp_solve;
    double time_sp_extract;
    double time_rf_assemble;
    double time_rf_reset;
    double time_rf_refactor;
    double time_rf_solve;

    parseCommandLineArguments(argc, argv, opts);

    printf("step 1.1: preparation\n");
    printf("step 1.1: read matrix market format\n");

    findCudaDevice(argc, (const char **)argv);

    if (opts.sparse_mat_filename == NULL)
    {
        opts.sparse_mat_filename =  sdkFindFilePath("lap2D_5pt_n100.mtx", argv[0]);
        printf("Using default input file [%s]\n", opts.sparse_mat_filename);
    }
    else
    {
        printf("Using input file [%s]\n", opts.sparse_mat_filename);
    }

    if (opts.sparse_mat_filename) 
    {
        if (loadMMSparseMatrix<double>(opts.sparse_mat_filename, 'd', true , &rowsA, &colsA,
               &nnzA, &h_csrValA, &h_csrRowPtrA, &h_csrColIndA, true)) 
        {
            return 1;
        }
        baseA = h_csrRowPtrA[0]; // baseA = {0,1}
    }

    if ( rowsA != colsA )
    {
        fprintf(stderr, "Error: only support square matrix\n");
        return 1;
    }

    printf("WARNING: cusolverRf only works for base-0 \n");
    if (baseA)
    {
        for(int i = 0 ; i <= rowsA ; i++)
        {
            h_csrRowPtrA[i]--;
        }
        for(int i = 0 ; i < nnzA ; i++)
        {
            h_csrColIndA[i]--;
        }
        baseA = 0;
    }

    printf("sparse matrix A is %d x %d with %d nonzeros, base=%d\n", rowsA, colsA, nnzA, baseA);

    checkCudaErrors(cusolverSpCreate(&cusolverSpH));
    checkCudaErrors(cusparseCreate(&cusparseH));
    checkCudaErrors(cudaStreamCreate(&stream));

    checkCudaErrors(cusolverSpSetStream(cusolverSpH, stream));
    checkCudaErrors(cusparseSetStream(cusparseH, stream));

    checkCudaErrors(cusparseCreateMatDescr(&descrA));
    checkCudaErrors(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));

    if (baseA) 
    {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
    }
    else
    {
        checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));
    }

    h_Qreorder   = (int*)malloc(sizeof(int)*colsA);

    h_csrRowPtrB = (int*   )malloc(sizeof(int)*(rowsA+1));
    h_csrColIndB = (int*   )malloc(sizeof(int)*nnzA);
    h_csrValB    = (double*)malloc(sizeof(double)*nnzA);
    h_mapBfromA  = (int*   )malloc(sizeof(int)*nnzA);

    h_x    = (double*)malloc(sizeof(double)*colsA);
    h_b    = (double*)malloc(sizeof(double)*rowsA);
    h_r    = (double*)malloc(sizeof(double)*rowsA);
    h_xhat = (double*)malloc(sizeof(double)*colsA);
    h_bhat = (double*)malloc(sizeof(double)*rowsA);

    assert(NULL != h_Qreorder);

    assert(NULL != h_csrRowPtrB);
    assert(NULL != h_csrColIndB);
    assert(NULL != h_csrValB   );
    assert(NULL != h_mapBfromA);

    assert(NULL != h_x);
    assert(NULL != h_b);
    assert(NULL != h_r);
    assert(NULL != h_xhat);
    assert(NULL != h_bhat);

    checkCudaErrors(cudaMalloc((void **)&d_csrRowPtrA, sizeof(int)*(rowsA+1)));
    checkCudaErrors(cudaMalloc((void **)&d_csrColIndA, sizeof(int)*nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_csrValA   , sizeof(double)*nnzA));
    checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_P, sizeof(int)*rowsA));
    checkCudaErrors(cudaMalloc((void **)&d_Q, sizeof(int)*colsA));
    checkCudaErrors(cudaMalloc((void **)&d_T, sizeof(double)*rowsA*1));

    printf("step 1.2: set right hand side vector (b) to 1\n");
    for(int row = 0 ; row < rowsA ; row++){
        h_b[row] = 1.0;
    }

    printf("step 2: reorder the matrix to reduce zero fill-in\n");
    printf("        Q = symrcm(A) or Q = symamd(A) \n");
    start = second();
    start = second();

    if ( 0 == strcmp(opts.reorder, "symrcm") )
    {
        checkCudaErrors(cusolverSpXcsrsymrcmHost(
            cusolverSpH, rowsA, nnzA,
            descrA, h_csrRowPtrA, h_csrColIndA, 
            h_Qreorder));
    }
    else if ( 0 == strcmp(opts.reorder, "symamd") )
    {
        checkCudaErrors(cusolverSpXcsrsymamdHost(
            cusolverSpH, rowsA, nnzA,
            descrA, h_csrRowPtrA, h_csrColIndA, 
            h_Qreorder));
    }
    else 
    {
        fprintf(stderr, "Error: %s is unknow reordering\n", opts.reorder);
        return 1;
    }

    stop = second();
    time_reorder = stop - start;

    printf("step 3: B = Q*A*Q^T\n");
    memcpy(h_csrRowPtrB, h_csrRowPtrA, sizeof(int)*(rowsA+1));
    memcpy(h_csrColIndB, h_csrColIndA, sizeof(int)*nnzA);

    start = second();
    start = second();

    checkCudaErrors(cusolverSpXcsrperm_bufferSizeHost(
        cusolverSpH, rowsA, colsA, nnzA,
        descrA, h_csrRowPtrB, h_csrColIndB,
        h_Qreorder, h_Qreorder,
        &size_perm));

    if (buffer_cpu) {
        free(buffer_cpu); 
    }
    buffer_cpu = (void*)malloc(sizeof(char)*size_perm);
    assert(NULL != buffer_cpu);

    // h_mapBfromA = Identity 
    for(int j = 0 ; j < nnzA ; j++){
        h_mapBfromA[j] = j;
    }
    checkCudaErrors(cusolverSpXcsrpermHost(
        cusolverSpH, rowsA, colsA, nnzA,
        descrA, h_csrRowPtrB, h_csrColIndB,
        h_Qreorder, h_Qreorder,
        h_mapBfromA,
        buffer_cpu));

    // B = A( mapBfromA )
    for(int j = 0 ; j < nnzA ; j++){
        h_csrValB[j] = h_csrValA[ h_mapBfromA[j] ];
    }

    stop = second();
    time_perm = stop - start;

    printf("step 4: solve A*x = b by LU(B) in cusolverSp\n");

    printf("step 4.1: create opaque info structure\n");
    checkCudaErrors(cusolverSpCreateCsrluInfoHost(&info));

    printf("step 4.2: analyze LU(B) to know structure of Q and R, and upper bound for nnz(L+U)\n");
    start = second();
    start = second();

    checkCudaErrors(cusolverSpXcsrluAnalysisHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrRowPtrB, h_csrColIndB,
        info));

    stop = second();
    time_sp_analysis = stop - start;

    printf("step 4.3: workspace for LU(B)\n");
    checkCudaErrors(cusolverSpDcsrluBufferInfoHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB,
        info,
        &size_internal,
        &size_lu));

    if (buffer_cpu) { 
        free(buffer_cpu); 
    }
    buffer_cpu = (void*)malloc(sizeof(char)*size_lu);
    assert(NULL != buffer_cpu);

    printf("step 4.4: compute Ppivot*B = L*U \n");
    start = second();
    start = second();

    checkCudaErrors(cusolverSpDcsrluFactorHost(
        cusolverSpH, rowsA, nnzA,
        descrA, h_csrValB, h_csrRowPtrB, h_csrColIndB,
        info, pivot_threshold,
        buffer_cpu));

    stop = second();
    time_sp_factor = stop - start;

    // TODO: check singularity by tol
    printf("step 4.5: check if the matrix is singular \n");
    checkCudaErrors(cusolverSpDcsrluZeroPivotHost(
        cusolverSpH, info, tol, &singularity));

    if ( 0 <= singularity){
        fprintf(stderr, "Error: A is not invertible, singularity=%d\n", singularity);
        return 1;
    }


    printf("step 4.6: solve A*x = b \n");
    printf("    i.e.  solve B*(Qx) = Q*b \n");
    start = second();
    start = second();

    // b_hat = Q*b
    for(int j = 0 ; j < rowsA ; j++){
        h_bhat[j] = h_b[h_Qreorder[j]];
    }
    // B*x_hat = b_hat
    checkCudaErrors(cusolverSpDcsrluSolveHost(
        cusolverSpH, rowsA, h_bhat, h_xhat, info, buffer_cpu));

    // x = Q^T * x_hat
    for(int j = 0 ; j < rowsA ; j++){
        h_x[h_Qreorder[j]] = h_xhat[j];
    }

    stop = second();
    time_sp_solve = stop - start;

    printf("step 4.7: evaluate residual r = b - A*x (result on CPU)\n");
    // use GPU gemv to compute r = b - A*x
    checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA     , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrValA   , h_csrValA   , sizeof(double)*nnzA  , cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_x, h_x, sizeof(double)*colsA, cudaMemcpyHostToDevice));

    /* Wrap raw data into cuSPARSE generic API objects */
    cusparseSpMatDescr_t matA = NULL;
    if (baseA)
    {
        checkCudaErrors(cusparseCreateCsr(
        &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F));
    }
    else
    {
        checkCudaErrors(cusparseCreateCsr(
        &matA, rowsA, colsA, nnzA, d_csrRowPtrA, d_csrColIndA, d_csrValA, CUSPARSE_INDEX_32I,
        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
    }

    cusparseDnVecDescr_t vecx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecx, colsA, d_x, CUDA_R_64F));
    cusparseDnVecDescr_t vecAx = NULL;
    checkCudaErrors(cusparseCreateDnVec(&vecAx, rowsA, d_r, CUDA_R_64F));

    /* Allocate workspace for cuSPARSE */
    size_t bufferSize = 0;
    checkCudaErrors(cusparseSpMV_bufferSize(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
    void *buffer = NULL;
    checkCudaErrors(cudaMalloc(&buffer, bufferSize));

    checkCudaErrors(cusparseSpMV(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer));

    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    A_inf = csr_mat_norminf(rowsA, colsA, nnzA, descrA, h_csrValA, h_csrRowPtrA, h_csrColIndA);

    printf("(CPU) |b - A*x| = %E \n", r_inf);
    printf("(CPU) |A| = %E \n", A_inf);
    printf("(CPU) |x| = %E \n", x_inf);
    printf("(CPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

    printf("step 5: extract P, Q, L and U from P*B*Q^T = L*U \n");
    printf("        L has implicit unit diagonal\n");
    start = second();
    start = second();

    checkCudaErrors(cusolverSpXcsrluNnzHost(
        cusolverSpH,
        &nnzL,
        &nnzU,
        info));

    h_Plu = (int*)malloc(sizeof(int)*rowsA);
    h_Qlu = (int*)malloc(sizeof(int)*colsA);

    h_csrValL    = (double*)malloc(sizeof(double)*nnzL);
    h_csrRowPtrL = (int*)malloc(sizeof(int)*(rowsA+1)); 
    h_csrColIndL = (int*)malloc(sizeof(int)*nnzL);

    h_csrValU    = (double*)malloc(sizeof(double)*nnzU);
    h_csrRowPtrU = (int*)malloc(sizeof(int)*(rowsA+1)); 
    h_csrColIndU = (int*)malloc(sizeof(int)*nnzU);

    assert(NULL != h_Plu);
    assert(NULL != h_Qlu);

    assert(NULL != h_csrValL);
    assert(NULL != h_csrRowPtrL);
    assert(NULL != h_csrColIndL);

    assert(NULL != h_csrValU);
    assert(NULL != h_csrRowPtrU);
    assert(NULL != h_csrColIndU);

    checkCudaErrors(cusolverSpDcsrluExtractHost(
        cusolverSpH,
        h_Plu,
        h_Qlu,
        descrA,
        h_csrValL, 
        h_csrRowPtrL,
        h_csrColIndL,
        descrA,
        h_csrValU,
        h_csrRowPtrU,
        h_csrColIndU,
        info,
        buffer_cpu));

    stop = second();
    time_sp_extract = stop - start;

    printf("nnzL = %d, nnzU = %d\n", nnzL, nnzU);

/*  B = Qreorder*A*Qreorder^T
 *  Plu*B*Qlu^T = L*U
 *
 *  (Plu*Qreorder)*A*(Qlu*Qreorder)^T = L*U
 *  
 *  Let P = Plu*Qreroder, Q = Qlu*Qreorder, 
 *  then we have
 *      P*A*Q^T = L*U
 *  which is the fundamental relation in cusolverRf.
 */
    printf("step 6: form P*A*Q^T = L*U\n");

    h_P = (int*)malloc(sizeof(int)*rowsA);
    h_Q = (int*)malloc(sizeof(int)*colsA);
    assert(NULL != h_P);
    assert(NULL != h_Q);

    printf("step 6.1: P = Plu*Qreroder\n");
    // gather operation, P = Qreorder(Plu)
    for(int j = 0 ; j < rowsA ; j++){
        h_P[j] = h_Qreorder[h_Plu[j]];
    }

    printf("step 6.2: Q = Qlu*Qreorder \n");
    // gather operation, Q = Qreorder(Qlu)
    for(int j = 0 ; j < colsA ; j++){
        h_Q[j] = h_Qreorder[h_Qlu[j]];
    }

    printf("step 7: create cusolverRf handle\n");
    checkCudaErrors(cusolverRfCreate(&cusolverRfH));

    printf("step 8: set parameters for cusolverRf \n");
    // numerical values for checking "zeros" and for boosting.
    checkCudaErrors(cusolverRfSetNumericProperties(cusolverRfH, nzero, nboost));

    // choose algorithm for refactorization and solve
    checkCudaErrors(cusolverRfSetAlgs(cusolverRfH, fact_alg, solve_alg));

    // matrix mode: L and U are CSR format, and L has implicit unit diagonal
    checkCudaErrors(cusolverRfSetMatrixFormat(
        cusolverRfH, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L));

    // fast mode for matrix assembling
    checkCudaErrors(cusolverRfSetResetValuesFastMode(
        cusolverRfH, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON));

    printf("step 9: assemble P*A*Q = L*U \n");
    start = second();
    start = second();

    checkCudaErrors(cusolverRfSetupHost(
        rowsA, nnzA, 
        h_csrRowPtrA, h_csrColIndA, h_csrValA,
        nnzL, 
        h_csrRowPtrL, h_csrColIndL, h_csrValL, 
        nnzU, 
        h_csrRowPtrU, h_csrColIndU, h_csrValU, 
        h_P, 
        h_Q, 
        cusolverRfH));

    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_rf_assemble = stop - start;

    printf("step 10: analyze to extract parallelism \n");
    checkCudaErrors(cusolverRfAnalyze(cusolverRfH));

    printf("step 11: import A to cusolverRf \n");
    checkCudaErrors(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(rowsA+1), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int)*nnzA     , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_csrValA   , h_csrValA   , sizeof(double)*nnzA  , cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_P, h_P, sizeof(int)*rowsA, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_Q, h_Q, sizeof(int)*colsA, cudaMemcpyHostToDevice));

    start = second();
    start = second();

    checkCudaErrors(cusolverRfResetValues(
        rowsA,nnzA,
        d_csrRowPtrA, d_csrColIndA, d_csrValA,
        d_P,
        d_Q,
        cusolverRfH));

    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_rf_reset = stop - start;

    printf("step 12: refactorization \n");
    start = second();
    start = second();

    checkCudaErrors(cusolverRfRefactor(cusolverRfH));

    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_rf_refactor = stop - start;

    printf("step 13: solve A*x = b \n");
    checkCudaErrors(cudaMemcpy(d_x, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

    start = second();
    start = second();

    checkCudaErrors(cusolverRfSolve(cusolverRfH, d_P, d_Q, 1, d_T, rowsA, d_x, rowsA));

    checkCudaErrors(cudaDeviceSynchronize());
    stop = second();
    time_rf_solve = stop - start;

    printf("step 14: evaluate residual r = b - A*x (result on GPU)\n");
    checkCudaErrors(cudaMemcpy(d_r, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));

    checkCudaErrors(cusparseSpMV(
        cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, &minus_one, matA, vecx,
        &one, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &buffer));

    checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));

    x_inf = vec_norminf(colsA, h_x);
    r_inf = vec_norminf(rowsA, h_r);
    printf("(GPU) |b - A*x| = %E \n", r_inf);
    printf("(GPU) |A| = %E \n", A_inf);
    printf("(GPU) |x| = %E \n", x_inf);
    printf("(GPU) |b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));

    printf("===== statistics \n");
    printf(" nnz(A) = %d, nnz(L+U) = %d, zero fill-in ratio = %f\n", 
        nnzA, nnzL + nnzU, ((double)(nnzL+nnzU))/(double)nnzA);
    printf("\n");
    printf("===== timing profile \n");
    printf(" reorder A   : %f sec\n", time_reorder);
    printf(" B = Q*A*Q^T : %f sec\n", time_perm);
    printf("\n");
    printf(" cusolverSp LU analysis: %f sec\n", time_sp_analysis);
    printf(" cusolverSp LU factor  : %f sec\n", time_sp_factor);
    printf(" cusolverSp LU solve   : %f sec\n", time_sp_solve);
    printf(" cusolverSp LU extract : %f sec\n", time_sp_extract);
    printf("\n");
    printf(" cusolverRf assemble : %f sec\n", time_rf_assemble);
    printf(" cusolverRf reset    : %f sec\n", time_rf_reset);
    printf(" cusolverRf refactor : %f sec\n", time_rf_refactor);
    printf(" cusolverRf solve    : %f sec\n", time_rf_solve);

    if (cusolverRfH) { checkCudaErrors(cusolverRfDestroy(cusolverRfH)); }
    if (cusolverSpH) { checkCudaErrors(cusolverSpDestroy(cusolverSpH)); }
    if (cusparseH  ) { checkCudaErrors(cusparseDestroy(cusparseH)); }
    if (stream     ) { checkCudaErrors(cudaStreamDestroy(stream)); }
    if (descrA     ) { checkCudaErrors(cusparseDestroyMatDescr(descrA)); }
    if (info       ) { checkCudaErrors(cusolverSpDestroyCsrluInfoHost(info)); }

    if (matA       ) { checkCudaErrors(cusparseDestroySpMat(matA)); }
    if (vecx       ) { checkCudaErrors(cusparseDestroyDnVec(vecx)); }
    if (vecAx      ) { checkCudaErrors(cusparseDestroyDnVec(vecAx)); }

    if (h_csrValA   ) { free(h_csrValA); }
    if (h_csrRowPtrA) { free(h_csrRowPtrA); }
    if (h_csrColIndA) { free(h_csrColIndA); }

    if (h_Qreorder  ) { free(h_Qreorder); }
    
    if (h_csrRowPtrB) { free(h_csrRowPtrB); }
    if (h_csrColIndB) { free(h_csrColIndB); }
    if (h_csrValB   ) { free(h_csrValB   ); }
    if (h_mapBfromA ) { free(h_mapBfromA ); }

    if (h_x   ) { free(h_x); }
    if (h_b   ) { free(h_b); }
    if (h_r   ) { free(h_r); }
    if (h_xhat) { free(h_xhat); }
    if (h_bhat) { free(h_bhat); }

    if (buffer_cpu) { free(buffer_cpu); }

    if (h_Plu) { free(h_Plu); }
    if (h_Qlu) { free(h_Qlu); }
    if (h_csrRowPtrL) { free(h_csrRowPtrL); }
    if (h_csrColIndL) { free(h_csrColIndL); }
    if (h_csrValL   ) { free(h_csrValL   ); }
    if (h_csrRowPtrU) { free(h_csrRowPtrU); }
    if (h_csrColIndU) { free(h_csrColIndU); }
    if (h_csrValU   ) { free(h_csrValU   ); }

    if (h_P) { free(h_P); }
    if (h_Q) { free(h_Q); }

    if (d_csrValA   ) { checkCudaErrors(cudaFree(d_csrValA)); }
    if (d_csrRowPtrA) { checkCudaErrors(cudaFree(d_csrRowPtrA)); }
    if (d_csrColIndA) { checkCudaErrors(cudaFree(d_csrColIndA)); }
    if (d_x) { checkCudaErrors(cudaFree(d_x)); }
    if (d_b) { checkCudaErrors(cudaFree(d_b)); }
    if (d_r) { checkCudaErrors(cudaFree(d_r)); }
    if (d_P) { checkCudaErrors(cudaFree(d_P)); }
    if (d_Q) { checkCudaErrors(cudaFree(d_Q)); }
    if (d_T) { checkCudaErrors(cudaFree(d_T)); }

    return 0;
}