From 2f26622fbef4610c51f567194bd8ec977454cb93 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 11 Jun 2024 11:16:43 -0600 Subject: [PATCH 01/32] Graph: removing executable from repo (#2239)
--- example/graph/PartitioningExample | Bin 21536 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100755 example/graph/PartitioningExample
diff --git a/example/graph/PartitioningExample b/example/graph/PartitioningExample deleted file mode 100755 index 88619a8d127f7c5acc2015424b160883008f33aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 literal 21536
zT28lbph7rpzrO!)gyr4*?)m!zWC_RYsHvjm^?hlL*YZxNV^8xCyuu2HZ&z5_hZfA&{u^Y&_cH(h From e275401eb806bc3b02304f74569c7984f02deac0 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 11 Jun 2024 14:45:27 -0600 Subject: [PATCH 02/32] Fix logic around merge path with TPLs (#2240) SPMV_MERGE_PATH is not always a native algorithm. Add SPMV_NATIVE_MERGE_PATH to cover that case specifically. Test this new option. --- sparse/impl/KokkosSparse_spmv_impl.hpp | 8 ++++---- sparse/src/KokkosSparse_spmv_handle.hpp | 12 +++++++++--- sparse/unit_test/Test_Sparse_spmv.hpp | 6 +++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index a2bb19a44c..f1f4c5700e 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -32,8 +32,6 @@ namespace KokkosSparse { namespace Impl { -constexpr const char* KOKKOSSPARSE_ALG_NATIVE_MERGE = "native-merge"; - // This TransposeFunctor is functional, but not necessarily performant. template @@ -609,7 +607,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { @@ -617,7 +616,8 @@ static void spmv_beta(const execution_space& exec, Handle* handle, dobeta, false>(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (handle->algo == SPMV_MERGE_PATH) { + if (handle->algo == SPMV_MERGE_PATH || + handle->algo == SPMV_NATIVE_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index 6d23d2bde1..38f6056615 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -36,8 +36,11 @@ enum SPMVAlgorithm { /// is only used once. SPMV_NATIVE, /// Use the best KokkosKernels implementation, even if a TPL /// implementation is available. - SPMV_MERGE_PATH, /// Use load-balancing merge path algorithm (for CrsMatrix - /// only) + SPMV_MERGE_PATH, /// Use algorithm optimized for matrices with + /// imbalanced/irregular sparsity patterns (merge path or + /// similar). May call a TPL. For CrsMatrix only. + SPMV_NATIVE_MERGE_PATH, /// Use the KokkosKernels implementation of merge + /// path. For CrsMatrix only. 
SPMV_BSR_V41, /// Use experimental version 4.1 algorithm (for BsrMatrix only) SPMV_BSR_V42, /// Use experimental version 4.2 algorithm (for BsrMatrix only) SPMV_BSR_TC /// Use experimental tensor core algorithm (for BsrMatrix only) @@ -59,6 +62,7 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { case SPMV_FAST_SETUP: return "SPMV_FAST_SETUP"; case SPMV_NATIVE: return "SPMV_NATIVE"; case SPMV_MERGE_PATH: return "SPMV_MERGE_PATH"; + case SPMV_NATIVE_MERGE_PATH: return "SPMV_NATIVE_MERGE_PATH"; case SPMV_BSR_V41: return "SPMV_BSR_V41"; case SPMV_BSR_V42: return "SPMV_BSR_V42"; case SPMV_BSR_TC: return "SPMV_BSR_TC"; @@ -73,10 +77,11 @@ inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { switch (a) { case SPMV_NATIVE: - case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: case SPMV_BSR_V41: case SPMV_BSR_V42: case SPMV_BSR_TC: return true; + // DEFAULT, FAST_SETUP and MERGE_PATH may call TPLs default: return false; } } @@ -351,6 +356,7 @@ struct SPMVHandle } else { switch (get_algorithm()) { case SPMV_MERGE_PATH: + case SPMV_NATIVE_MERGE_PATH: throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + get_spmv_algorithm_name(get_algorithm()) + " cannot be used if A is a BsrMatrix"); diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 2057a8ba14..0921a1b45a 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -518,7 +518,11 @@ template (algo, numRows, nnz, bandwidth, row_size_variance, heavy); } From efbf210d923e407dd0a3048a24497512a0f0ee32 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 11 Jun 2024 23:02:16 -0600 Subject: [PATCH 03/32] spgemm unit test: change matrix value distribution (#2241) Change the distribution A, B values are sampled from so that values in C can't end up close to 0 (as the result of summing terms that are larger). The relative error metric in is_same_matrix is sensitive to this. Fixes #2232 --- common/src/KokkosKernels_SimpleUtils.hpp | 4 ++-- sparse/unit_test/Test_Sparse_spgemm.hpp | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index b447f13397..055c1d6d32 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -342,9 +342,9 @@ struct IsRelativelyIdenticalFunctor { if (val_diff > mag_type(eps)) { Kokkos::printf( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " - "(eps = %e)\n", + "(eps = %e, rel err = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), - KAT::imag(view2(i)), eps); + KAT::imag(view2(i)), eps, val_diff); num_diffs++; } } diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index bd1e68c370..139e47dcef 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -69,7 +69,10 @@ void randomize_matrix_values(const Values &v) { ScalarType randStart, randEnd; KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); - Kokkos::fill_random(v, pool, randStart, randEnd); + // Instead of sampling from [-50, 50] or [-50-50i, 50+50i], + // sample from [1, 50] or [1+i, 50+50i]. That way relative + // error between values can't become large if values happen to sum close to 0. 
+ Kokkos::fill_random(v, pool, randEnd / 50.0, randEnd); } template @@ -254,6 +257,8 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, m, k, nnz, row_size_variance, bandwidth); crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); + randomize_matrix_values(A.values); + randomize_matrix_values(B.values); KokkosSparse::sort_crs_matrix(A); KokkosSparse::sort_crs_matrix(B);

From f400cc98fbdd060224e9b8662a17a065e7958cbf Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 12 Jun 2024 15:18:25 -0600 Subject: [PATCH 04/32] kokkoskernels_tpls.cmake: remove duplicate arguments when creating the argument for exported INTERFACE_INCLUDE_DIRECTORIES

Attempt to work around issue #2238 --- cmake/kokkoskernels_tpls.cmake | 3 +++ 1 file changed, 3 insertions(+)

diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 6af952ce94..49d1adcdcb 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -330,6 +330,9 @@ MACRO(kokkoskernels_export_imported_tpl NAME) GET_TARGET_PROPERTY(TPL_INCLUDES ${TPL_IMPORTED_NAME} INTERFACE_INCLUDE_DIRECTORIES) IF(TPL_INCLUDES) + # remove duplicates to prevent incorrect number of arguments to INTERFACE_INCLUDE_DIRECTORIES + # see issue #2238 + LIST(REMOVE_DUPLICATES TPL_INCLUDES) KOKKOSKERNELS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES ${TPL_INCLUDES}") ENDIF()

From e220db3603057d5ce7c70f9fc69beaacb407b918 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 12 Jun 2024 16:50:50 -0600 Subject: [PATCH 05/32] Sparse - BsrMatrix: adding new wiki example for documentation (#2228)

There is already an example for this, but it uses a CrsMatrix as the starting point to build a BsrMatrix. That is not really helpful in general: the hope is that you can use the BsrMatrix without needing the CrsMatrix, since keeping both would double the storage needed. Addressing Kim's comments --- example/wiki/sparse/CMakeLists.txt | 5 + .../sparse/KokkosSparse_wiki_bsrmatrix_2.cpp | 247 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp

diff --git a/example/wiki/sparse/CMakeLists.txt b/example/wiki/sparse/CMakeLists.txt index 16d6a3a89d..8d061c24f8 100644 --- a/example/wiki/sparse/CMakeLists.txt +++ b/example/wiki/sparse/CMakeLists.txt @@ -10,6 +10,11 @@ if (KOKKOSKERNELS_ENABLE_EXPERIMENTAL) ) endif() +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_bsrmatrix_2 + SOURCES KokkosSparse_wiki_bsrmatrix_2.cpp + ) + KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( wiki_crsmatrix SOURCES KokkosSparse_wiki_crsmatrix.cpp

diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp new file mode 100644 index 0000000000..7ff56ff14a --- /dev/null +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -0,0 +1,247 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include + +#include "Kokkos_Core.hpp" + +#include "KokkosKernels_default_types.hpp" +#include "KokkosSparse_BsrMatrix.hpp" + +using Scalar = default_scalar; +using Ordinal = default_lno_t; +using Offset = default_size_type; +using Layout = default_layout; + +template +struct bsr_fill { + bsrmatrix_type bsr_mat; + + bsr_fill(bsrmatrix_type bsr_mat_) : bsr_mat(bsr_mat_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + if (rowIdx == 0) { // Left boundary condition + auto block_tmp = bsr_mat.unmanaged_block(0); + block_tmp(0, 0) = 1.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 1.0; + } else if (rowIdx == bsr_mat.numRows() - 1) { // Right boundary condition + auto block_tmp = + bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 1.0; + block_tmp(1, 1) = 1.0; + } else { + auto block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx)); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = -1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 1); + block_tmp(0, 0) = 2.0; + block_tmp(0, 1) = 0.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = 2.0; + + block_tmp = bsr_mat.unmanaged_block(bsr_mat.graph.row_map(rowIdx) + 2); + block_tmp(0, 0) = -1.0; + block_tmp(0, 1) = 1.0 / 2.0; + block_tmp(1, 0) = 0.0; + block_tmp(1, 1) = -1.0; + } + } +}; + +template +struct diagonal_extractor { + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + using bsr_block_type = typename bsrmatrix_type::block_type; + + bsrmatrix_type bsr_mat; + row_map_type row_map; + entries_type entries; + diag_blocks_type diag_blocks; + + diagonal_extractor(bsrmatrix_type bsr_mat_, diag_blocks_type diag_blocks_) + : bsr_mat(bsr_mat_), + row_map(bsr_mat_.graph.row_map), + entries(bsr_mat_.graph.entries), + diag_blocks(diag_blocks_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& rowIdx) const { + for (Offset entryIdx = row_map(rowIdx); entryIdx < row_map(rowIdx + 1); + ++entryIdx) { + if (entries(entryIdx) == rowIdx) { + bsr_block_type bsr_diag_block = bsr_mat.unmanaged_block(entryIdx); + for (int i = 0; i < bsr_mat.blockDim(); ++i) { + for (int j = 0; j < bsr_mat.blockDim(); ++j) { + diag_blocks(rowIdx, i, j) = bsr_diag_block(i, j); + } + } + } + } + } +}; + +int main(int argc, char* argv[]) { + using device_type = typename Kokkos::Device< + Kokkos::DefaultExecutionSpace, + typename Kokkos::DefaultExecutionSpace::memory_space>; + using bsrmatrix_type = + typename KokkosSparse::Experimental::BsrMatrix; + using graph_type = typename bsrmatrix_type::staticcrsgraph_type; + using row_map_type = typename graph_type::row_map_type; + using entries_type = typename graph_type::entries_type; + + Kokkos::initialize(argc, argv); + { + // + // We will create a 1D discretization for the coupled thermo-elastic + // diffusion + // + // -\div(EA \grad_s(u) - \alpha(T-T0)I) = f_u + // -\kappa\Delta(T) = f_T + // + // The problem is discretized using finite differences as follows: + // \frac{d^2 u}{dx^2}\approx \frac{u_{i+1}-2u_i+u_{i-1}}{h_x^2} + // \frac{dT}{dx}\approx\frac{T_{i+1}-T_{i-1}}{2h_x} + // \frac{d^2T}{dx^2}\approx\frac{T_{i+1}-2T_i+T_{i-1}}{h_x^2} + // + // This leads to the combined stencil (assuming all unit coefficients): + // + // [-1 1/2] 
[2 0] [-1 -1/2] + // [ 0 -1] [0 2] [ 0 -1] + // + // First the graph for the mesh will be constructed. + // Second a BsrMatrix will be constructed from the graph + // Third the values of the BsrMatrix will be filled. + + constexpr Ordinal blockSize = 2; + constexpr Ordinal numRows = 10; + constexpr Offset numNNZ = 3 * numRows - 2; + bsrmatrix_type bsr_mat; + + { + typename row_map_type::non_const_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row pointers"), + numRows + 1); + typename entries_type::non_const_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "column indices"), + numNNZ); + typename row_map_type::HostMirror row_map_h = + Kokkos::create_mirror_view(row_map); + typename entries_type::HostMirror entries_h = + Kokkos::create_mirror_view(entries); + + // First Step: build the CrsGraph + { + // Build the row pointers and store numNNZ + + row_map_h(0) = 0; + for (Ordinal rowIdx = 0; rowIdx < numRows; ++rowIdx) { + if (rowIdx == 0) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx; + entries_h(row_map_h(rowIdx) + 1) = rowIdx + 1; + } else if (rowIdx == numRows - 1) { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 2; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + } else { + row_map_h(rowIdx + 1) = row_map_h(rowIdx) + 3; + + entries_h(row_map_h(rowIdx)) = rowIdx - 1; + entries_h(row_map_h(rowIdx) + 1) = rowIdx; + entries_h(row_map_h(rowIdx) + 2) = rowIdx + 1; + } + } + + if (row_map_h(numRows) != numNNZ) { + std::ostringstream error_msg; + error_msg << "error: row_map(numRows) != numNNZ, row_map_h(numRows)=" + << row_map_h(numRows) << ", numNNZ=" << numNNZ; + throw std::runtime_error(error_msg.str()); + } + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + } + + graph_type myGraph(entries, row_map); + + // Second Step: build the BsrMatrix from graph and block size + bsr_mat = bsrmatrix_type("block matrix", myGraph, blockSize); + + bsr_fill fillFunctor(bsr_mat); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), fillFunctor); + + std::cout << "BsrMatrix graph: " << std::endl; + for (int rowIdx = 0; rowIdx < numRows; ++rowIdx) { + std::cout << " ["; + for (int colIdx = 0; colIdx < entries_h(row_map_h(rowIdx)); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + for (Offset entryIdx = row_map_h(rowIdx); + entryIdx < row_map_h(rowIdx + 1) - 1; ++entryIdx) { + for (int colIdx = entries_h(entryIdx) + 1; + colIdx < entries_h(entryIdx + 1); ++colIdx) { + std::cout << " "; + } + std::cout << "*"; + } + for (int colIdx = entries_h(row_map_h(rowIdx + 1) - 1) + 1; + colIdx < numRows; ++colIdx) { + std::cout << " "; + } + std::cout << "]" << std::endl; + } + } + + // Extract diagonal block and store them in a rank-3 view + using diag_blocks_type = + Kokkos::View; + diag_blocks_type diag_blocks("diagonal blocks", numRows, blockSize, + blockSize); + diagonal_extractor myFunc(bsr_mat, diag_blocks); + Kokkos::parallel_for(Kokkos::RangePolicy(0, numRows), myFunc); + + auto diag_blocks_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, diag_blocks); + + std::cout << "\nBsrMatrix diagonal blocks: " << std::endl; + for (int blockId = 0; blockId < diag_blocks_h.extent_int(0); ++blockId) { + std::cout << " [" << diag_blocks_h(blockId, 0, 0) << ", " + << diag_blocks_h(blockId, 0, 1) << "]" << std::endl; + std::cout << " [" << diag_blocks_h(blockId, 1, 0) << ", " + << diag_blocks_h(blockId, 1, 1) << "]\n" + << std::endl; + } + } + 
Kokkos::finalize(); + + return 0; +} From 3ad65b226a419023d8b45ffe9158fc3a650d3aec Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 13 Jun 2024 08:45:34 -0600 Subject: [PATCH 06/32] Sparse - CrsToBsr: fix type mismatch (#2242) --- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 7f1ff2171e..f773bdc0d8 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -99,6 +99,7 @@ template Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { using bsr_value_type = typename Bsr::value_type; using bsr_ordinal_type = typename Bsr::ordinal_type; + using crs_size_type = typename Crs::non_const_size_type; // copy matrix data to host auto hRowMap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), @@ -119,7 +120,7 @@ Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { for (bsr_ordinal_type row = 0; row < bsr_ordinal_type(hRowMap.size()) - 1; ++row) { - for (size_t ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { + for (crs_size_type ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { bsr_ordinal_type col = hColInds(ci); bsr_value_type val = hVals(ci); From 119eb18cefd8165d743ade19642678ffb933e465 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 13 Jun 2024 09:45:20 -0600 Subject: [PATCH 07/32] Update rocsparse algo defaults (#2245) * Update default spmv algorithms for rocsparse - Use stream for common cases (default, fast setup) as it has nearly zero setup cost and performs well for somewhat balanced matrices - Use adaptive (which is rocsparse's default) only if SPMV_MERGE_PATH is the algorithm, as it has a very high setup cost * Re-enable rocsparse spmv for SPMV_FAST_SETUP --- sparse/src/KokkosSparse_spmv.hpp | 7 ------- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 10 +++++++++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index ddbef56504..5fa0be3619 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -247,13 +247,6 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], YVector_Internal y_i(y); bool useNative = is_spmv_algorithm_native(handle->get_algorithm()); - // Also use the native algorithm if SPMV_FAST_SETUP was selected and - // rocSPARSE is the possible TPL to use. Native is faster in this case. 
-#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (handle->get_algorithm() == SPMV_FAST_SETUP && - std::is_same_v) - useNative = true; -#endif // Now call the proper implementation depending on isBSR and the rank of X/Y if constexpr (!isBSR) { diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index be2588483f..33eb052135 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -392,7 +392,15 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); - rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; + // Default to using the "stream" algorithm which has almost no setup cost, + // and performs well for reasonably balanced matrices + rocsparse_spmv_alg alg = rocsparse_spmv_alg_csr_stream; + if (handle->get_algorithm() == SPMV_MERGE_PATH) { + // Only use the "adaptive" algorithm if the user has indicated that the + // matrix is very imbalanced, by asking for merge path. This algorithm + // has fairly expensive setup + alg = rocsparse_spmv_alg_csr_adaptive; + } KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; if (handle->tpl_rank1) { From 774eff42fb66b017be4b0ea598057ad378ff9351 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 13 Jun 2024 11:19:25 -0600 Subject: [PATCH 08/32] In deprecated spmv, fix Controls algorithm mapping (#2246) native -> SPMV_NATIVE native-merge -> SPMV_NATIVE_MERGE_PATH merge -> SPMV_MERGE_PATH tpl -> SPMV_FAST_SETUP --- sparse/src/KokkosSparse_spmv_deprecated.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spmv_deprecated.hpp b/sparse/src/KokkosSparse_spmv_deprecated.hpp index f29caaec0c..0faef2ef4e 100644 --- a/sparse/src/KokkosSparse_spmv_deprecated.hpp +++ b/sparse/src/KokkosSparse_spmv_deprecated.hpp @@ -191,20 +191,18 @@ spmv(const ExecutionSpace& space, // Default to fast setup, since this handle can't be reused SPMVAlgorithm algo = SPMV_FAST_SETUP; // Translate the Controls algorithm selection to the SPMVHandle algorithm. - // This maintains the old behavior, where any manually set name that isn't - // "tpl" gives native. - // - // This also uses the behavior set by #2021: "merge" was a hint to use - // cuSPARSE merge path, but that path is gone so just use the normal TPL. - // "merge-path" means to use the KK merge-path implementation. // // And also support the 3 different BSR algorithms by their old names. 
if (controls.isParameter("algorithm")) { std::string algoName = controls.getParameter("algorithm"); - if (algoName == "merge" || algoName == "tpl") + if (algoName == "tpl") algo = SPMV_FAST_SETUP; - else if (algoName == "native-merge") + else if (algoName == "native") + algo = SPMV_NATIVE; + else if (algoName == "merge") algo = SPMV_MERGE_PATH; + else if (algoName == "native-merge") + algo = SPMV_NATIVE_MERGE_PATH; else if (algoName == "v4.1") algo = SPMV_BSR_V41; else if (algoName == "v4.2") From b5e6fa915da060f373e93c4500001bf3c10705f0 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:55:01 +0200 Subject: [PATCH 09/32] Add batched serial tbsv (#2202) * Add batched serial tbsv * remove incx argument and use strided views instead * Add a new line at the end of files * fix random number generation for complex numbers * remove unused variables from internal tbsv serial functions * remove allclose for testing --------- Co-authored-by: Yuuichi Asahi --- .../impl/KokkosBatched_Tbsv_Serial_Impl.hpp | 169 +++++++++ .../KokkosBatched_Tbsv_Serial_Internal.hpp | 224 +++++++++++ batched/dense/src/KokkosBatched_Tbsv.hpp | 56 +++ .../dense/unit_test/Test_Batched_Dense.hpp | 3 + .../unit_test/Test_Batched_DenseUtils.hpp | 61 ++- .../unit_test/Test_Batched_SerialTbsv.hpp | 349 ++++++++++++++++++ .../Test_Batched_SerialTbsv_Complex.hpp | 120 ++++++ .../Test_Batched_SerialTbsv_Real.hpp | 137 +++++++ blas/impl/KokkosBlas_util.hpp | 1 + 9 files changed, 1118 insertions(+), 2 deletions(-) create mode 100644 batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp create mode 100644 batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp create mode 100644 batched/dense/src/KokkosBatched_Tbsv.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp new file mode 100644 index 0000000000..675e73f744 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Impl.hpp @@ -0,0 +1,169 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv_Serial_Internal.hpp" + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkTbsvInput( + [[maybe_unused]] const AViewType &A, [[maybe_unused]] const XViewType &x, + [[maybe_unused]] const int k) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::tbsv: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::tbsv: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, + "KokkosBatched::tbsv: XViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (k < 0) { + Kokkos::printf( + "KokkosBatched::tbsv: input parameter k must not be less than 0: k = " + "%d\n", + k); + return 1; + } + + const int lda = A.extent(0), n = A.extent(1); + if (lda < (k + 1)) { + Kokkos::printf( + "KokkosBatched::tbsv: leading dimension of A must be smaller than k+1: " + "lda = %d, k = %d\n", + lda, k); + return 1; + } + + const int nx = x.extent(0); + if (nx != n) { + Kokkos::printf( + "KokkosBatched::tbsv: Dimensions of x and A do not match: X: %d, A: %d " + "x %d\n" + "x.extent(0) must be equal to A.extent(1)\n", + nx, lda, n); + return 1; + } +#endif + return 0; +} + +//// Lower non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLower::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Lower conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalLowerTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper non-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpper::invoke( + ArgDiag::use_unit_diag, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +//// Upper transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, false, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), 
x.stride_0(), k); + } +}; + +//// Upper conjugate-transpose //// +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &x, const int k) { + auto info = checkTbsvInput(A, x, k); + if (info) return info; + + return SerialTbsvInternalUpperTranspose::invoke( + ArgDiag::use_unit_diag, true, A.extent(1), A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), k); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp new file mode 100644 index 0000000000..d2f5df4649 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Tbsv_Serial_Internal.hpp @@ -0,0 +1,224 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower, Non-Transpose +/// + +template +struct SerialTbsvInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLower::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; ++j) { + if (x[j * xs0] != static_cast(0)) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j + 1; i < Kokkos::min(an, j + k + 1); ++i) { + x[i * xs0] = x[i * xs0] - temp * A[(i - j) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Lower, Transpose +/// + +template +struct SerialTbsvInternalLowerTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalLowerTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + auto temp = x[j * xs0]; + + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i 
> j; --i) { + temp -= + Kokkos::ArithTraits::conj(A[(i - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = temp / Kokkos::ArithTraits::conj(A[0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::min(an - 1, j + k); i > j; --i) { + temp -= A[(i - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +/// +/// Upper, Non-Transpose +/// + +template +struct SerialTbsvInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpper::invoke( + const bool use_unit_diag, const int an, const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = an - 1; j >= 0; --j) { + if (x[j * xs0] != 0) { + if (!use_unit_diag) x[j * xs0] = x[j * xs0] / A[k * as0 + j * as1]; + + auto temp = x[j * xs0]; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = j - 1; i >= Kokkos::max(0, j - k); --i) { + x[i * xs0] = x[i * xs0] - temp * A[(k - j + i) * as0 + j * as1]; + } + } + } + + return 0; +} + +/// +/// Upper, Transpose +/// + +template +struct SerialTbsvInternalUpperTranspose { + template + KOKKOS_INLINE_FUNCTION static int invoke(const bool use_unit_diag, + const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int k); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int +SerialTbsvInternalUpperTranspose::invoke( + const bool use_unit_diag, const bool do_conj, const int an, + const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int k) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < an; j++) { + auto temp = x[j * xs0]; + if (do_conj) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= Kokkos::ArithTraits::conj( + A[(i + k - j) * as0 + j * as1]) * + x[i * xs0]; + } + if (!use_unit_diag) + temp = + temp / Kokkos::ArithTraits::conj(A[k * as0 + j * as1]); + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = Kokkos::max(0, j - k); i < j; ++i) { + temp -= A[(i + k - j) * as0 + j * as1] * x[i * xs0]; + } + if (!use_unit_diag) temp = temp / A[k * as0 + j * as1]; + } + x[j * xs0] = temp; + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_TBSV_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Tbsv.hpp b/batched/dense/src/KokkosBatched_Tbsv.hpp new file mode 100644 index 0000000000..7510c07969 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Tbsv.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_TBSV_HPP_ +#define KOKKOSBATCHED_TBSV_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Tbsv: +/// +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the triangular solve algorithm Tbsv. Ab is an n by n unit, or +/// non-unit, upper or lower triangular band matrix, with ( k + 1 ) +/// diagonals. +/// +/// \tparam AViewType: Input type for the matrix, needs to be a 2D view +/// \tparam XViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param A [in]: A is a lda by n banded matrix, with ( k + 1 ) diagonals +/// \param X [inout]: right-hand side and the solution, a rank 1 view +/// \param k [in]: k specifies the number of superdiagonals or subdiagonals of +/// matrix A. k >= 0 +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialTbsv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, + const XViewType &X, const int k); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Tbsv_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_TBSV_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index cf9b3c23f4..7b0ee58312 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -42,6 +42,9 @@ #include "Test_Batched_SerialTrsv.hpp" #include "Test_Batched_SerialTrsv_Real.hpp" #include "Test_Batched_SerialTrsv_Complex.hpp" +#include "Test_Batched_SerialTbsv.hpp" +#include "Test_Batched_SerialTbsv_Real.hpp" +#include "Test_Batched_SerialTbsv_Complex.hpp" #include "Test_Batched_SerialTrtri.hpp" #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 6a96bd193a..689ff4f7a5 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -16,10 +16,12 @@ #ifndef TEST_BATCHED_DENSE_HELPER_HPP #define TEST_BATCHED_DENSE_HELPER_HPP +#include "KokkosBatched_Util.hpp" + namespace KokkosBatched { template -void create_tridiagonal_batched_matrices(const MatrixViewType &A, - const VectorViewType &B) { +void create_tridiagonal_batched_matrices(const MatrixViewType& A, + const VectorViewType& B) { Kokkos::Random_XorShift64_Pool< typename VectorViewType::device_type::execution_space> random(13718); @@ -54,6 +56,61 @@ void create_tridiagonal_batched_matrices(const MatrixViewType &A, Kokkos::fence(); } + +template +void create_banded_triangular_matrix(InViewType& in, OutViewType& out, + int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + 
} + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp new file mode 100644 index 0000000000..572e02053b --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv.hpp @@ -0,0 +1,349 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Tbsv { + +template +struct ParamTag { + using uplo = U; + using trans = T; + using diag = D; +}; + +template +struct Functor_BatchedSerialTrsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + + ScalarType _alpha; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTrsv(const ScalarType alpha, const AViewType &a, + const BViewType &b) + : _a(a), _b(b), _alpha(alpha) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + KokkosBatched::SerialTrsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(_alpha, aa, bb); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialTbsv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + int _k; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialTbsv(const AViewType &a, const BViewType &b, const int k) + : _a(a), _b(b), _k(k) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, 
Kokkos::ALL()); + + KokkosBatched::SerialTbsv< + typename ParamTagType::uplo, typename ParamTagType::trans, + typename ParamTagType::diag, AlgoTagType>::invoke(aa, bb, _k); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialTbsv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_tbsv(const int N, const int k, const int BlkSize) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + View3DType A("A", N, BlkSize, BlkSize), Ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x1("x1", N, BlkSize); // Solutions + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(Ref, rand_pool, randStart, randEnd); + Kokkos::fill_random(x0, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(x1, x0); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(Ref, A, k, + false); + create_banded_triangular_matrix(Ref, Ab, k, + true); + + // Reference trsv + Functor_BatchedSerialTrsv(1.0, A, x0) + .run(); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + // Check x0 = x1 + auto h_x0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x0); + auto h_x1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x1); + for (int i = 0; i < N; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_x0(i, j), h_x1(i, j), eps); + } + } +} + +template +/// \brief Implementation details of batched tbsv test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +void impl_test_batched_tbsv_analytical(const std::size_t N) { + using execution_space = typename DeviceType::execution_space; + using View2DType = Kokkos::View; + using StridedView2DType = + Kokkos::View; + using View3DType = Kokkos::View; + + // Reference is created by trsv from triangular matrix + constexpr std::size_t BlkSize = 3, k = 2, incx = 2; + + View3DType A("A", N, BlkSize, BlkSize), ref("Ref", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize); // Solutions + + // Testing incx argument with strided Views + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType x1("x1", layout); // Solutions + + Kokkos::RangePolicy policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Initialize", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < BlkSize; j++) { 
+ ref(ib, i, j) = i + 1; + } + } + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = 1; + x1(ib, j) = 1; + } + + if (std::is_same_v) { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0 / 2.0; + x_ref(ib, 1) = 1.0 / 6.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 0.0; + } + } + } else { + if (std::is_same_v) { + if (std::is_same_v) { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0 / 2.0; + x_ref(ib, 2) = -1.0 / 6.0; + } else { + x_ref(ib, 0) = 1.0; + x_ref(ib, 1) = -1.0; + x_ref(ib, 2) = 1.0; + } + } else { + if (std::is_same_v) { + x_ref(ib, 0) = 0.0; + x_ref(ib, 1) = 0.0; + x_ref(ib, 2) = 1.0 / 3.0; + } else { + x_ref(ib, 0) = 2.0; + x_ref(ib, 1) = -2.0; + x_ref(ib, 2) = 1.0; + } + } + } + }); + + Kokkos::fence(); + + // Create triangluar or banded matrix + create_banded_triangular_matrix(ref, A, k, + false); + create_banded_triangular_matrix(ref, Ab, k, + true); + + // tbsv + Functor_BatchedSerialTbsv(Ab, x0, k) + .run(); + + // tbsv with incx == 2 + Functor_BatchedSerialTbsv(Ab, x1, k) + .run(); + + Kokkos::fence(); + + // Check x0 = x_ref and x1 = x_ref + // Firstly, prepare contiguous views on host + auto h_x0 = Kokkos::create_mirror_view(x0); + auto h_x1 = Kokkos::create_mirror_view(x0); + + Kokkos::deep_copy(h_x0, x0); + + // Pack x1 into x0 for contiguous storage + Kokkos::parallel_for( + "KokkosBatched::Test::SerialTbsv::Copy", policy, + KOKKOS_LAMBDA(const std::size_t ib) { + for (std::size_t j = 0; j < BlkSize; j++) { + x0(ib, j) = x1(ib, j); + } + }); + + Kokkos::fence(); + Kokkos::deep_copy(h_x1, x0); + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using mag_type = typename ats::mag_type; + mag_type eps = 1.0e3 * ats::epsilon(); + + auto h_x_ref = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t j = 0; j < BlkSize; j++) { + // Check x0 = x_ref + EXPECT_NEAR_KK(h_x0(ib, j), h_x_ref(ib, j), eps); + + // Check x1 = x_ref + EXPECT_NEAR_KK(h_x1(ib, j), h_x_ref(ib, j), eps); + } + } +} + +} // namespace Tbsv +} // namespace Test + +template +int test_batched_tbsv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(0); + Test::Tbsv::impl_test_batched_tbsv_analytical< + DeviceType, ScalarType, LayoutType, ParamTagType, AlgoTagType>(1); + Test::Tbsv::impl_test_batched_tbsv(0, 1, 10); + for (int i = 0; i < 10; i++) { + Test::Tbsv::impl_test_batched_tbsv(1, 1, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp new file mode 100644 index 0000000000..8789cc6931 --- /dev/null +++ 
b/batched/dense/unit_test/Test_Batched_SerialTbsv_Complex.hpp @@ -0,0 +1,120 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} + +/* [FIXME] These tests need Trans::ConjTranspose in trsv. 
+// CONJUGATE TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_u_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_ct_n_dcomplex) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv, param_tag_type, + algo_tag_type>(); +} +*/ +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp new file mode 100644 index 0000000000..8915b4ad05 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialTbsv_Real.hpp @@ -0,0 +1,137 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_float) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +// NO TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_nt_u_double) { + using param_tag_type = + 
::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_nt_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +// TRANSPOSE +TEST_F(TestCategory, batched_serial_tbsv_l_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_l_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_u_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +TEST_F(TestCategory, batched_serial_tbsv_u_t_n_double) { + using param_tag_type = + ::Test::Tbsv::ParamTag; + using algo_tag_type = typename Algo::Tbsv::Unblocked; + + test_batched_tbsv(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 50173538fb..ecb72e7c9a 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -116,6 +116,7 @@ struct Algo { using Gemv = Level2; using Trsv = Level2; using ApplyQ = Level2; + using Tbsv = Level2; }; namespace Impl { From 49b1d46fcb08e71e8f652cef8b488ac60d6c99d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:56:18 +0200 Subject: [PATCH 10/32] Bump actions/checkout from 4.1.6 to 4.1.7 (#2248) Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.6 to 4.1.7. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/a5ac7e51b41094c92402da3b24376905380afc29...692973e3d937129bcbf40652eb9f2f61becf3332) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bdw.yml | 4 ++-- .github/workflows/codeql.yml | 4 ++-- .github/workflows/dependency-review.yml | 2 +- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/h100.yml | 4 ++-- .github/workflows/mi210.yml | 4 ++-- .github/workflows/osx.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- .github/workflows/spr.yml | 4 ++-- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index f60008ab72..450a0975ab 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -188,12 +188,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 06328c83c1..073453d075 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels @@ -52,7 +52,7 @@ jobs: # Prefix the list here with "+" to use these queries and those in the config file. - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: 'kokkos/kokkos' path: 'kokkos' diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index b911317970..1792f0181c 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -22,6 +22,6 @@ jobs: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: 'Dependency Review' uses: actions/dependency-review-action@72eb03d02c7872a771aacd928f3123ac62ad6d3a # v4.3.3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 901e218fdc..9690446a4f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,12 +25,12 @@ jobs: doxygen --version - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 5517b68dbb..08b541587f 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: 
actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - name: Install Dependencies run: sudo apt install clang-format-8 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 5fd01d972b..0d20177b96 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index 7b55f065bf..9735e405f1 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -107,12 +107,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 082467d614..fa23b5dd72 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: 4.3.00 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 2885ca7fae..dee549daeb 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -38,7 +38,7 @@ jobs: egress-policy: audit - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: persist-credentials: false diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 8fe8053f5b..c38d04ac8d 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -26,12 +26,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 with: repository: kokkos/kokkos ref: ${{ github.base_ref }} From fe2a92f9f20b37589c89180f9aa6ac18ee020926 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 21:57:00 +0200 Subject: [PATCH 11/32] Bump github/codeql-action from 3.25.8 to 3.25.10 (#2249) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.8 to 3.25.10. 
- [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/2e230e8fe0ad3a14a340ad0815ddb96d599d2aff...23acc5c183826b7a8a97bce3cecc52db901f8251) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 073453d075..7ed1a206a3 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/init@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/analyze@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index dee549daeb..32dcabc873 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@2e230e8fe0ad3a14a340ad0815ddb96d599d2aff # v3.25.8 + uses: github/codeql-action/upload-sarif@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 with: sarif_file: results.sarif From 819c40b84aa2ce7b6075eec50f6bb140dd2909a4 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 17 Jun 2024 16:02:15 -0600 Subject: [PATCH 12/32] FindTPLROC*: updates to fix export of import targets Changes for the Rocm tpls to match the handling as done with the Cuda tpls Should resolve issue #2238 --- cmake/Modules/FindTPLROCBLAS.cmake | 58 ++++++++++++++++++++++------ cmake/Modules/FindTPLROCSOLVER.cmake | 55 ++++++++++++++++++++++---- cmake/Modules/FindTPLROCSPARSE.cmake | 54 ++++++++++++++++++++++---- 3 files changed, 139 insertions(+), 28 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index c0a9de3b50..4edcd82944 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,13 +1,47 @@ -# MPL: 12/29/2022: CMake regular way to find a package -FIND_PACKAGE(ROCBLAS) -if(TARGET roc::rocblas) -## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention -## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) - SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) - SET(TPL_IMPORTED_NAME roc::rocblas) -## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention -## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) - ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +IF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS AND ROCBLAS_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER_PATHS ${ROCBLAS_INCLUDE_DIRS} + ) +ELSEIF(ROCBLAS_LIBRARIES AND ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARIES) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES ${ROCBLAS_LIBRARIES} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS} + HEADER rocblas.h + ) +ELSEIF(ROCBLAS_ROOT OR KokkosKernels_ROCBLAS_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCBLAS INTERFACE + LIBRARIES rocblas + HEADER rocblas.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCBLAS) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCBLAS_FOUND) + #Important note here: this find Module is named TPLROCBLAS + #The eventual target is named ROCBLAS. To avoid naming conflicts + #the find module is called TPLROCBLAS. 
This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE + LINK_LIBRARIES "${ROCBLAS_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/Modules/FindTPLROCSOLVER.cmake b/cmake/Modules/FindTPLROCSOLVER.cmake index 8f2a92cfda..58eae9f8f5 100644 --- a/cmake/Modules/FindTPLROCSOLVER.cmake +++ b/cmake/Modules/FindTPLROCSOLVER.cmake @@ -1,9 +1,48 @@ -# LBV: 11/08/2023: This file follows the partern of FindTPLROCBLAS.cmake/FindTPLROCSPARSE.cmake -FIND_PACKAGE(ROCSOLVER) -if(TARGET roc::rocsolver) - SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) - SET(TPL_IMPORTED_NAME roc::rocsolver) - ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +IF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS AND ROCSOLVER_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER_PATHS ${ROCSOLVER_INCLUDE_DIRS} + ) +ELSEIF(ROCSOLVER_LIBRARIES AND ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARIES) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES ${ROCSOLVER_LIBRARIES} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + LIBRARY_PATHS ${ROCSOLVER_LIBRARY_DIRS} + HEADER rocsolver.h + ) +ELSEIF(ROCSOLVER_ROOT OR KokkosKernels_ROCSOLVER_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSOLVER INTERFACE + LIBRARIES rocsolver + HEADER rocsolver.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSOLVER) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSOLVER_FOUND) + #Important note here: this find Module is named TPLROCSOLVER + #The eventual target is named ROCSOLVER. To avoid naming conflicts + #the find module is called TPLROCSOLVER. 
This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSOLVER_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSOLVER REQUIRED_VARS ROCSOLVER_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSOLVER INTERFACE + LINK_LIBRARIES "${ROCSOLVER_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() + diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 5f985ff3a8..3b45ba5e82 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,9 +1,47 @@ -# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake -FIND_PACKAGE(ROCSPARSE) -if(TARGET roc::rocsparse) - SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) - SET(TPL_IMPORTED_NAME roc::rocsparse) - ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) -ELSE() - MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +IF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_INCLUDE_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER_PATHS ${ROCSPARSE_INCLUDE_DIRS} + ) +ELSEIF(ROCSPARSE_LIBRARIES AND ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARIES) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES ${ROCSPARSE_LIBRARIES} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS} + HEADER rocsparse.h + ) +ELSEIF(ROCSPARSE_ROOT OR KokkosKernels_ROCSPARSE_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(ROCSPARSE INTERFACE + LIBRARIES rocsparse + HEADER rocsparse.h + ) +ELSE() # backwards-compatible way + FIND_PACKAGE(ROCSPARSE) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT ROCSPARSE_FOUND) + #Important note here: this find Module is named TPLROCSPARSE + #The eventual target is named ROCSPARSE. To avoid naming conflicts + #the find module is called TPLROCSPARSE. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE + LINK_LIBRARIES "${ROCSPARSE_LIBRARIES}") + ENDIF() + ENDIF() ENDIF() From 47942bf7c87c470275c5bb221e4a52eefa989e40 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 24 Jun 2024 08:58:55 -0600 Subject: [PATCH 13/32] Fix warning about memcpy (#2252) When building Stokhos BlockCrs, this util function gave a warning about memcpy modifying a non-trivially-copyable type. 
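For readers who have not met this diagnostic before, here is a minimal standalone illustration of the pattern; the FadLike type is hypothetical and only stands in for a Stokhos/Sacado-style scalar, and the warning in question is the one GCC reports as -Wclass-memaccess:

#include <cstring>

struct FadLike {                                    // non-trivially copyable stand-in
  double* coeffs;
  FadLike(const FadLike& o) : coeffs(o.coeffs) {}   // user-provided copy constructor
};

void copy_block(FadLike* dst, const FadLike* src, int n) {
  // std::memcpy(dst, src, n * sizeof(FadLike));        // warns: -Wclass-memaccess
  std::memcpy((void*)dst, src, n * sizeof(FadLike));    // cast marks the byte copy as intentional
}

The byte-wise copy is exactly what the block utility wants, so the cast does not change behavior; it only tells the compiler that bypassing the copy constructor is deliberate.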
Silence it by casting to void* --- common/src/KokkosKernels_BlockUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_BlockUtils.hpp b/common/src/KokkosKernels_BlockUtils.hpp index 006a38a6e4..6fd9d9b656 100644 --- a/common/src/KokkosKernels_BlockUtils.hpp +++ b/common/src/KokkosKernels_BlockUtils.hpp @@ -39,7 +39,7 @@ template KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, value_type *dst, const value_type *val) { - memcpy(dst, val, block_dim * block_dim * sizeof(value_type)); + memcpy((void *)dst, val, block_dim * block_dim * sizeof(value_type)); } // Performs A += B on blocks From e9f7913d9ca017c297f15ebd9156f985efb52f13 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 24 Jun 2024 22:23:30 -0600 Subject: [PATCH 14/32] RCM fixes, improvements (#2254) * Fix RCM starting vertex issue, improve testing * apply reversing as labels are computed instead of at the end. Saves a loop over all the labels * use min-degree starting vertex within each connected component --- graph/impl/KokkosGraph_BFS_impl.hpp | 57 +++++------ graph/unit_test/Test_Graph_rcm.hpp | 145 ++++++++++++++++++++++------ 2 files changed, 148 insertions(+), 54 deletions(-) diff --git a/graph/impl/KokkosGraph_BFS_impl.hpp b/graph/impl/KokkosGraph_BFS_impl.hpp index e73c1cb489..9ea5d63e07 100644 --- a/graph/impl/KokkosGraph_BFS_impl.hpp +++ b/graph/impl/KokkosGraph_BFS_impl.hpp @@ -38,7 +38,7 @@ struct SerialRCM { host_lno_view_t entries; SerialRCM(const rowmap_t& rowmap_, const entries_t& entries_) - : numVerts(rowmap_.extent(0) - 1), + : numVerts(std::max(rowmap_.extent_int(0), 1) - 1), rowmap(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostRowmap"), rowmap_.extent(0)), entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "HostEntries"), @@ -47,35 +47,39 @@ struct SerialRCM { Kokkos::deep_copy(entries, entries_); } - lno_t findPseudoPeripheral() { - // Choose vertex with smallest degree - lno_t periph = -1; - lno_t periphDeg = numVerts; - for (lno_t i = 0; i < numVerts; i++) { - lno_t deg = rowmap(i + 1) - rowmap(i); - if (deg < periphDeg) { - periph = i; - periphDeg = deg; - if (deg == 0) break; - } - } - return periph; - } - lno_view_t rcm() { - lno_t start = findPseudoPeripheral(); + // Given a label L, labelReverse - L gives the reversed label (as in reverse + // Cuthill McKee) + lno_t labelReverse = numVerts - 1; host_lno_view_t q(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Queue"), numVerts); host_lno_view_t label( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Permutation"), numVerts); for (lno_t i = 0; i < numVerts; i++) label(i) = -1; - lno_t qhead = 0; - lno_t qtail = 0; - label(start) = qtail; + lno_t qhead = 0; + lno_t qtail = 0; + // List of all vertices, in order from lowest to highest degree + // (heuristic for best to worst starting vertex for RCM). + // If the graph has multiple connected components, restart at the first + // unlabeled vertex in this list. 
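// Two consequences of the degree-sorted allVertices list are worth noting here:
// (1) when the BFS queue drains while vertices remain unlabeled, the restart vertex
//     pulled from this list is the minimum-degree unlabeled vertex, i.e. the
//     min-degree vertex of the next connected component to be visited; and
// (2) the reversed Cuthill-McKee label is written on the fly further down as
//     labelReverse - (position in the BFS queue), which is what makes the old
//     end-of-routine pass that reversed every label unnecessary.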
+ host_lno_view_t allVertices( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "allVertices"), + numVerts); + for (lno_t i = 0; i < numVerts; i++) allVertices(i) = i; + std::sort(allVertices.data(), allVertices.data() + numVerts, + [&](lno_t n1, lno_t n2) -> bool { + // return true if n1 has a lower degree than n2 + return (rowmap(n1 + 1) - rowmap(n1)) < + (rowmap(n2 + 1) - rowmap(n2)); + }); + lno_t allVerticesIter = 0; + // Start RCM with the first vertex in allVertices + lno_t start = allVertices(allVerticesIter++); + label(start) = labelReverse - qtail; q(qtail++) = start; + // Reuse this neighbor list for all levels without deallocating std::vector neighbors; - lno_t outerQueue = 0; while (true) { lno_t v = q(qhead++); neighbors.clear(); @@ -94,7 +98,7 @@ struct SerialRCM { }); // label and enqueue all unlabeled neighbors for (lno_t nei : neighbors) { - label(nei) = qtail; + label(nei) = labelReverse - qtail; q(qtail++) = nei; } if (qtail == numVerts) { @@ -102,16 +106,15 @@ struct SerialRCM { break; } else if (qhead == qtail) { // have exhausted this connected component, but others remain unlabeled - while (label(outerQueue) != -1) outerQueue++; - label(outerQueue) = qtail; - q(qtail++) = outerQueue; + while (label(allVertices(allVerticesIter)) != -1) allVerticesIter++; + lno_t restart = allVertices(allVerticesIter); + label(restart) = labelReverse - qtail; + q(qtail++) = restart; } } lno_view_t labelOut( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCM Permutation"), numVerts); - // reverse the labels - for (lno_t i = 0; i < numVerts; i++) label(i) = numVerts - label(i) - 1; Kokkos::deep_copy(labelOut, label); return labelOut; } diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 2e05554d2d..a6d165d8c3 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -19,7 +19,7 @@ #include "KokkosGraph_RCM.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_CrsMatrix.hpp" +#include "Kokkos_StaticCrsGraph.hpp" #include @@ -81,7 +81,7 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, const labels_t& invPerm, const labels_t& perm) { using size_type = typename rowmap_t::non_const_value_type; using lno_t = typename entries_t::non_const_value_type; - lno_t numVerts = rowmap.extent(0) - 1; + lno_t numVerts = std::max(1, rowmap.extent_int(0)) - 1; int bw = 0; for (lno_t i = 0; i < numVerts; i++) { lno_t origRow = perm(i); @@ -97,18 +97,10 @@ int maxBandwidth(const rowmap_t& rowmap, const entries_t& entries, return bw; } -template -void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type rowmap_t; - typedef typename graph_t::entries_type entries_t; - lno_t numVerts = gridX * gridY * gridZ; - typename rowmap_t::non_const_type rowmap; - typename entries_t::non_const_type entries; - generate7pt(rowmap, entries, gridX, gridY, gridZ); +template +void test_rcm(const rowmap_t& rowmap, const entries_t& entries, + bool expectBandwidthReduced) { + using lno_t = typename entries_t::non_const_value_type; auto rcm = KokkosGraph::Experimental::graph_rcm( rowmap, entries); auto rowmapHost = @@ -116,6 +108,7 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { auto entriesHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); auto rcmHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rcm); + lno_t numVerts = 
std::max(rowmap.extent_int(0), 1) - 1; decltype(rcmHost) rcmPermHost( Kokkos::view_alloc(Kokkos::WithoutInitializing, "RCMPerm"), numVerts); for (lno_t i = 0; i < numVerts; i++) rcmPermHost(rcmHost(i)) = i; @@ -130,21 +123,119 @@ void test_rcm(lno_t gridX, lno_t gridY, lno_t gridZ) { } for (lno_t i = 0; i < numVerts; i++) ASSERT_EQ(counts[i], 1); } - Kokkos::View identityOrder( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); - for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; - size_t origBW = - maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); - size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); - EXPECT_LE(rcmBW, origBW); + if (expectBandwidthReduced) { + Kokkos::View identityOrder( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Identity"), numVerts); + for (lno_t i = 0; i < numVerts; i++) identityOrder(i) = i; + size_t origBW = + maxBandwidth(rowmapHost, entriesHost, identityOrder, identityOrder); + size_t rcmBW = maxBandwidth(rowmapHost, entriesHost, rcmHost, rcmPermHost); + EXPECT_LE(rcmBW, origBW); + } +} + +template +void test_rcm_zerorows() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, + bool expectBandwidthReduced) { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap; + entries_t entries; + generate7pt(rowmap, entries, gridX, gridY, gridZ); + test_rcm(rowmap, entries, expectBandwidthReduced); +} + +template +void test_rcm_4clique() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + rowmap_t rowmap("rowmap", 5); + entries_t entries("entries", 16); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i < 5; i++) rowmap_host(i) = i * 4; + for (lno_t i = 0; i < 16; i++) entries_host(i) = i % 4; + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, false); +} + +template +void test_rcm_multiple_components() { + using graph_t = + Kokkos::StaticCrsGraph; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + // Generate a single 3D grid first + rowmap_t rowmap_cube; + entries_t entries_cube; + generate7pt(rowmap_cube, entries_cube, 7, 7, 7); + auto rowmap_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowmap_cube); + auto entries_cube_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries_cube); + lno_t nv_cube = 7 * 7 * 7; + lno_t ne_cube = entries_cube.extent(0); + // Now replicate the graph twice, so there are 2 disconnected copies of the + // cube + rowmap_t rowmap("rowmap", nv_cube * 2 + 1); + entries_t entries("entries", ne_cube * 2); + auto rowmap_host = Kokkos::create_mirror_view(rowmap); + auto entries_host = Kokkos::create_mirror_view(entries); + for (lno_t i = 0; i <= nv_cube * 2; i++) { + if (i < nv_cube) + rowmap_host(i) = rowmap_cube_host(i); + else + rowmap_host(i) = ne_cube + 
rowmap_cube_host(i - nv_cube); + } + for (lno_t i = 0; i < ne_cube * 2; i++) { + if (i < ne_cube) + entries_host(i) = entries_cube_host(i); + else + entries_host(i) = nv_cube + entries_cube_host(i - ne_cube); + } + Kokkos::deep_copy(rowmap, rowmap_host); + Kokkos::deep_copy(entries, entries_host); + test_rcm(rowmap, entries, true); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - graph##_##rcm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_rcm(6, 3, 3); \ - test_rcm(20, 20, 20); \ - test_rcm(100, 100, 1); \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_zerorows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_zerorows(); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_7pt##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_7pt(1, 1, 1, false); \ + test_rcm_7pt(2, 1, 1, false); \ + test_rcm_7pt(6, 3, 3, true); \ + test_rcm_7pt(20, 20, 20, true); \ + test_rcm_7pt(100, 100, 1, true); \ + } \ + TEST_F(TestCategory, \ + graph##_##rcm_4clique##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_4clique(); \ + } \ + TEST_F( \ + TestCategory, \ + graph##_##rcm_multiple_components##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_rcm_multiple_components(); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ From 31be65899841af53928451311658b8c5a7500831 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 25 Jun 2024 08:09:07 -0600 Subject: [PATCH 15/32] spgemm: add profiling regions to native implementations (#2253) * spgemm: add profiling regions to native implementations * Add profiling region to KokkosSPGEMM::KokkosSPGEMM_symbolic --- sparse/impl/KokkosSparse_spgemm_impl_def.hpp | 2 ++ sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 ++ sparse/impl/KokkosSparse_spgemm_impl_seq.hpp | 5 +++++ sparse/impl/KokkosSparse_spgemm_impl_speed.hpp | 2 ++ 4 files changed, 11 insertions(+) diff --git a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index a420a81c90..54e4e228c8 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -59,6 +59,7 @@ void KokkosSPGEMM::KokkosSPGEMM_symbolic(c_row_view_t rowmapC_) { + Kokkos::Profiling::pushRegion("KokkosSparse::spgemm_symbolic[NATIVE]"); { if (KOKKOSKERNELS_VERBOSE) { std::cout << "SYMBOLIC PHASE" << std::endl; @@ -162,6 +163,7 @@ void KokkosSPGEMMget_spgemm_handle()->set_c_nnz(result_index); Kokkos::deep_copy(row_mapC, h_rmc); Kokkos::fence(); + Kokkos::Profiling::popRegion(); } template Date: Fri, 28 Jun 2024 09:35:40 -0600 Subject: [PATCH 16/32] sparse: replace macros with constexpr bools (#2260) --- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 61 +++--- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 34 +-- sparse/src/KokkosSparse_par_ilut.hpp | 195 +++++++++--------- 3 files changed, 149 insertions(+), 141 deletions(-) diff --git a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index fa356dc963..16c228d8ec 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -169,10 +169,11 @@ struct UnsortedNumericSumFunctor { const CcolindsT Bpos; }; -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_numeric_same_type = + 
std::is_same_v, + typename std::remove_const_t>; template < typename execution_space, typename KernelHandle, typename alno_row_view_t, @@ -193,46 +194,56 @@ void spadd_numeric_impl( typedef typename KernelHandle::nnz_scalar_t scalar_type; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) - static_assert(SAME_TYPE(ascalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "A scalar type must match handle scalar type"); - static_assert(SAME_TYPE(bscalar_t, scalar_type), + static_assert(spadd_numeric_same_type, "B scalar type must match handle scalar type"); - static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type), - "add_symbolic: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type), - "add_symbolic: B size_type must match KernelHandle size_type " - "(const doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type), + spadd_numeric_same_type, + "add_symbolic: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, + "add_symbolic: B size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + spadd_numeric_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: A entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: B entry type must match KernelHandle entry type " "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type), + static_assert(spadd_numeric_same_type, "add_symbolic: C entry type must match KernelHandle entry type " "(aka nnz_lno_t)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); static_assert( - SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: A scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: B scalar type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type), + spadd_numeric_same_type, "add_symbolic: C scalar type must match KernelHandle entry type (aka " "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C scalar type must not be const"); + static_assert( + std::is_same_v, + "add_symbolic: C scalar type must not be const"); typedef Kokkos::RangePolicy range_type; auto addHandle = kernel_handle->get_spadd_handle(); // rowmap length can be 0 or 1 if #rows is 0. 
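To make the intent of the macro-to-variable-template change explicit, the general pattern is sketched below in simplified form; the names T1, T2 and same_type_ignoring_const are illustrative rather than the ones used in the actual headers, and the macro shown is a reconstruction of the removed one:

#include <type_traits>

// Before (reconstructed): a function-like macro.
//   #define SAME_TYPE(A, B)
//     std::is_same<typename std::remove_const<A>::type,
//                  typename std::remove_const<B>::type>::value

// After: a constexpr variable template with the same meaning; it respects
// namespaces, shows up in diagnostics by name, and needs no trailing #undef.
template <typename T1, typename T2>
inline constexpr bool same_type_ignoring_const =
    std::is_same_v<std::remove_const_t<T1>, std::remove_const_t<T2>>;

static_assert(same_type_ignoring_const<const double, double>);
static_assert(!same_type_ignoring_const<float, double>);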
@@ -269,8 +280,6 @@ void spadd_numeric_impl( addHandle->set_call_numeric(); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 80506e3056..764d185f90 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -24,10 +24,11 @@ namespace KokkosSparse { namespace Impl { -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool spadd_symbolic_same_type = + std::is_same_v, + typename std::remove_const_t>; // get C rowmap for sorted input template , "add_symbolic: A size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: B size_type must match KernelHandle size_type (const " "doesn't matter)"); static_assert( - SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), + spadd_symbolic_same_type, "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C size_type must not be const"); static_assert( - SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: A entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); static_assert( - SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), + spadd_symbolic_same_type, "add_symbolic: B entry type must match KernelHandle entry type (aka " "nnz_lno_t, and const doesn't matter)"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "add_symbolic: C entry type must not be const"); // symbolic just needs to compute c_rowmap // easy for sorted, but for unsorted is easiest to just compute the whole sum @@ -594,8 +600,6 @@ void spadd_symbolic_impl( addHandle->set_call_numeric(false); } -#undef SAME_TYPE - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 8ded6209ec..edaae8192f 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,9 +44,11 @@ namespace KokkosSparse { namespace Experimental { -#define KOKKOSKERNELS_PAR_ILUT_SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value +// Two types are the same (ignoring const) +template +constexpr bool parilut_same_type = + std::is_same_v, + typename std::remove_const_t>; /// @brief Performs the symbolic phase of par_ilut. /// This is a non-blocking function. 
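For orientation, the two phases declared in this header are normally driven in sequence, roughly as in the sketch below; the wrapper name ilut_factor and the create_par_ilut_handle call are assumptions to be checked against the KokkosKernels handle documentation, while the argument lists mirror the parameters asserted on in this file:

#include "KokkosSparse_par_ilut.hpp"

// A minimal sketch, not a drop-in routine: view allocation and handle template
// parameters are left to the caller.
template <class Handle, class RowMap, class Entries, class Values>
void ilut_factor(Handle& kh, const RowMap& A_rowmap, const Entries& A_entries,
                 const Values& A_values, RowMap& L_rowmap, Entries& L_entries,
                 Values& L_values, RowMap& U_rowmap, Entries& U_entries,
                 Values& U_values) {
  kh.create_par_ilut_handle();  // assumed handle setup call
  // Symbolic phase: computes the row maps (sizes) of L and U from A's pattern.
  KokkosSparse::Experimental::par_ilut_symbolic(&kh, A_rowmap, A_entries,
                                                L_rowmap, U_rowmap);
  // Resize L_entries/L_values and U_entries/U_values here from the nnz counts
  // recorded in the handle, then fill them in the numeric phase.
  KokkosSparse::Experimental::par_ilut_numeric(&kh, A_rowmap, A_entries, A_values,
                                               L_rowmap, L_entries, L_values,
                                               U_rowmap, U_entries, U_values);
}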
@@ -78,24 +80,24 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: A size_type must match KernelHandle " - "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert( + parilut_same_type, + "par_ilut_symbolic: A size_type must match KernelHandle " + "size_type (const doesn't matter)"); + static_assert(parilut_same_type, "par_ilut_symbolic: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: L size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: L size_type must match KernelHandle " + "size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), - "par_ilut_symbolic: U size_type must match KernelHandle " - "size_type (const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_symbolic: U size_type must match KernelHandle " + "size_type (const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_symbolic: A_rowmap is not a Kokkos::View."); @@ -118,25 +120,25 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: A_rowmap, L_rowmap and U_rowmap must all " "have rank 1."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output L_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: The output U_rowmap must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_symbolic: Views LRowMapType and URowMapType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); @@ -165,26 +167,26 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, 
typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -240,46 +242,43 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, using scalar_type = typename KernelHandle::nnz_scalar_t; static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: A size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: A entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename AValuesType::value_type, scalar_type), - "par_ilut_numeric: A scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: A scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: L size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: L entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename LValuesType::value_type, scalar_type), - "par_ilut_numeric: L scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: L scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert( - KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), + parilut_same_type, "par_ilut_numeric: U size_type must match KernelHandle size_type " "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UEntriesType::non_const_value_type, ordinal_type), + static_assert(parilut_same_type, "par_ilut_numeric: U entry type must match KernelHandle entry " "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_PAR_ILUT_SAME_TYPE( - typename UValuesType::value_type, scalar_type), - "par_ilut_numeric: U scalar type must match KernelHandle entry " - "type (aka nnz_scalar_t, and const doesn't matter)"); + static_assert( + parilut_same_type, + "par_ilut_numeric: U scalar type must match KernelHandle entry " + "type (aka nnz_scalar_t, and const doesn't matter)"); static_assert(Kokkos::is_view::value, "par_ilut_numeric: A_rowmap is not a Kokkos::View."); @@ -330,73 +329,71 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_numeric: A_values, L_values and U_values must all " "have rank 1."); - static_assert( - std::is_same::value, - "par_ilut_numeric: The output L_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output L_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output L_values must be nonconst."); - static_assert( - std::is_same::value, - 
"par_ilut_numeric: The output U_entries must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, + "par_ilut_numeric: The output U_entries must be nonconst."); + static_assert(std::is_same_v, "par_ilut_numeric: The output U_values must be nonconst."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and ARowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and AEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and AValuesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LRowMapType and URowMapType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LEntriesType and UEntriesType have " "different device_types."); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "par_ilut_numeric: Views LValuesType and UValuesType have " "different device_types."); static_assert( - std::is_same< + std::is_same_v< typename LRowMapType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LEntriesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same< + std::is_same_v< typename LValuesType::device_type::execution_space, - typename KernelHandle::PAR_ILUTHandleType::execution_space>::value, + typename KernelHandle::PAR_ILUTHandleType::execution_space>, "par_ilut_numeric: KernelHandle and Views have different execution " "spaces."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and entries have different device types."); static_assert( - std::is_same::value, + std::is_same_v, "par_ilut_numeric: rowmap and values have different device types."); // Check if symbolic has been called @@ -431,58 +428,58 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< AEntriesType>::array_layout, typename AEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using AValues_Internal = Kokkos::View< typename AValuesType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename AValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LRowMap_Internal = Kokkos::View< typename LRowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LRowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LEntries_Internal = Kokkos::View::array_layout, typename LEntriesType::device_type, - 
Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using LValues_Internal = Kokkos::View< typename LValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename LValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using URowMap_Internal = Kokkos::View< typename URowMapType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename URowMapType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UEntries_Internal = Kokkos::View::array_layout, typename UEntriesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; using UValues_Internal = Kokkos::View< typename UValuesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename UValuesType::device_type, - Kokkos::MemoryTraits >; + Kokkos::MemoryTraits>; ARowMap_Internal A_rowmap_i = A_rowmap; AEntries_Internal A_entries_i = A_entries; @@ -519,6 +516,4 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, } // namespace Experimental } // namespace KokkosSparse -#undef KOKKOSKERNELS_PAR_ILUT_SAME_TYPE - #endif // KOKKOSSPARSE_PAR_ILUT_HPP_ From 41954e20d6811b5c21918bb195c8b6dd833c7736 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 28 Jun 2024 12:44:58 -0600 Subject: [PATCH 17/32] Rename `Impl::alignPtr` to `Impl::alignPtrTo`, allow it to infer argument type (#2261) * KokkosKernels::Impl::alignPtr infers argument type * Rename KokkosKernels::Impl::alignPtr -> alignPtrTo --- common/src/KokkosKernels_Utils.hpp | 4 ++-- sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 12 +++++------- sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp | 2 +- sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp | 15 ++++++--------- sparse/impl/KokkosSparse_spgemm_impl_speed.hpp | 2 +- .../KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp | 12 +++++------- 6 files changed, 20 insertions(+), 27 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index ba8049cecf..45aa8132bc 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,8 +1527,8 @@ struct array_sum_reduce { } }; -template -KOKKOS_INLINE_FUNCTION T *alignPtr(InPtr p) { +template +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // ugly but computationally free and the "right" way to do this in C++ std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 6eb9044733..a36200b295 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -270,8 +270,7 @@ struct KokkosBSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr, nullptr, hash_ids, hash_values); @@ -414,7 +413,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); BlockAccumulator hm(block_dim, thread_shmem_key_size, thread_shared_memory_hash_func, begins, nexts, keys, @@ -554,7 +553,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -601,8 +600,7 @@ struct KokkosBSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. 
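// (Aside on the alignPtr -> alignPtrTo rename above: the helper rounds a raw pointer
//  up to the next address suitably aligned for T. A reconstruction under the usual
//  round-up-to-alignof(T) idiom is sketched here; the exact body in
//  KokkosKernels_Utils.hpp may differ.)
//
//    #include <cstdint>
//    template <typename T, typename InPtr>
//    KOKKOS_INLINE_FUNCTION T* alignPtrTo(InPtr p) {
//      const std::uintptr_t v = reinterpret_cast<std::uintptr_t>(p);
//      return reinterpret_cast<T*>((v + alignof(T) - 1) & ~std::uintptr_t(alignof(T) - 1));
//    }
//
//  With the renamed helper, call sites such as the ones in this hunk only spell the
//  destination type (for example alignPtrTo<scalar_t>(tmp + pow2_hash_size)) and let
//  InPtr be deduced from the argument.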
{ @@ -885,7 +883,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index bc1b378558..22111d3752 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -325,7 +325,7 @@ struct KokkosBSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::BlockHashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, diff --git a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index a7fee71278..8fb2711cdf 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -261,8 +261,7 @@ struct KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -409,8 +408,7 @@ struct KokkosSPGEMM(tmp); + hm2.values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -498,7 +496,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, @@ -639,7 +637,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); @@ -686,8 +684,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); } // initialize begins. { @@ -970,7 +967,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 954bfb3f3e..e19f5b7bc5 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -304,7 +304,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); KokkosKernels::Experimental::HashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index f638b76b9b..d48d297e2d 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -260,8 +260,7 @@ struct KokkosSPGEMM(tmp); + scalar_t *hash_values = KokkosKernels::Impl::alignPtrTo(tmp); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), @@ -452,7 +451,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); // Create the hashmaps KokkosKernels::Experimental::HashmapAccumulator< @@ -610,7 +609,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -826,7 +825,7 @@ struct KokkosSPGEMM(all_shared_memory); + KokkosKernels::Impl::alignPtrTo(all_shared_memory); int thread_rank = teamMember.team_rank(); int vector_rank = 0; @@ -871,8 +870,7 @@ struct KokkosSPGEMM( - tmp + pow2_hash_size); + KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); nnz_lno_t num_threads = pow2_hash_size / 
vector_size; Kokkos::parallel_for( From 1df84bb48ede8a61ba949e844745842b0b3ba11f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 08:08:46 -0600 Subject: [PATCH 18/32] Bump github/codeql-action from 3.25.10 to 3.25.11 (#2263) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.25.10 to 3.25.11. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/23acc5c183826b7a8a97bce3cecc52db901f8251...b611370bb5703a7efb587f9d136a52ea24c5c38c) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecards.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7ed1a206a3..e1f8aa51f8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -44,7 +44,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/init@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/analyze@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: category: "/language:c-cpp" diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 32dcabc873..4396ad1cdb 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@23acc5c183826b7a8a97bce3cecc52db901f8251 # v3.25.10 + uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: sarif_file: results.sarif From 0a0c04836c0ffd7cb85ab17b3f1e34ae3c919bff Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 1 Jul 2024 09:01:14 -0600 Subject: [PATCH 19/32] sparse: spadd_symbolic fences before device values used on host (#2259) * sparse: spadd_symbolic fences before device values used on host * sparse: use prefix sum to remove explicit spadd fence --- sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 764d185f90..9744e75ec3 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -546,9 +546,7 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - exec, nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(exec, c_nnz_upperbound, - Kokkos::subview(c_rowmap_upperbound, nrows)); + exec, nrows + 1, c_rowmap_upperbound, c_nnz_upperbound); } ordinal_view_t c_entries_uncompressed( Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, @@ -595,6 +593,7 @@ void spadd_symbolic_impl( // provide the number of NNZ in C to user through handle size_type cmax; Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); + exec.fence("fence before cmax used on host"); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); addHandle->set_call_numeric(false); From cfcde6777bc490271d113f7fc9599039e009aa74 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 1 Jul 2024 16:56:04 -0600 Subject: [PATCH 20/32] sycl: use alternative `alignPtrTo` when SYCL is enabled (SpGEMM) (#2262) * sycl: use alternative alignPtr when SYCL is enabled The current alignPtr, as well as two other alternatives below, do not work on SYCL on Intel PVC. unsigned int f1(unsigned int i, unsigned int align) { return ((i + align - 1) / align * align); } unsigned int f2(unsigned int i, unsigned int align) { return (i + align - 1) & (-align); } * alignPtrTo unit tests --- common/src/KokkosKernels_Utils.hpp | 27 ++++ common/unit_test/Test_Common.hpp | 1 + common/unit_test/Test_Common_AlignPtrTo.hpp | 166 ++++++++++++++++++++ 3 files changed, 194 insertions(+) create mode 100644 common/unit_test/Test_Common_AlignPtrTo.hpp diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 45aa8132bc..89aeabb823 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,6 +1527,32 @@ struct array_sum_reduce { } }; +/* Several alternatives were considered for SYCL, including + +unsigned int f1(unsigned int i, unsigned int align) +{ + return ((i + align - 1) / align * align); +} + +unsigned int f2(unsigned int i, unsigned int align) +{ + return (i + align - 1) & (-align); +} + +f1 should be equivalent to the below, but it produces incorrect results on SYCL +f2 is how GCC does std::align, but it also produces incorrect results on SYCL +possibly alignof(T) is not a power-of-2 on SYCL? Or a compiler error. 
+*/ +#if defined(KOKKOS_ENABLE_SYCL) +template +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + while (ptrVal % alignof(T)) { + ++ptrVal; + } + return reinterpret_cast(ptrVal); +} +#else template KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // ugly but computationally free and the "right" way to do this in C++ @@ -1535,6 +1561,7 @@ KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { // and the mask produces the start of that scalar_t. return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); } +#endif } // namespace Impl } // namespace KokkosKernels diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 2ccf9c2103..fb93a494d6 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -16,6 +16,7 @@ #ifndef TEST_COMMON_HPP #define TEST_COMMON_HPP +#include #include // #include #include diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp new file mode 100644 index 0000000000..f60887cd80 --- /dev/null +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -0,0 +1,166 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file + +This test file was motivated by an observation in the SpGEMM on SYCL that +strange values were coming out of the pointer alignment functions, causing +Kokkos::atomic_add to be a no-op or write 0. The Kokkos Kernels alignPtrTo +function was updated with the one of four implementations that was observed to +work on SYCL (even though all four in here should be okay.) + +TEST_FN 0-3 are various implemetations, and TEST_FN 4 is testing Kokkos Kernels +implementation. The tests are written to PASS for the observed SYCL behavor - +i.e., that TEST_FN 1,4 produce aligned pointers, and the others do not (even +though they should). If the other functions start working on SYCL, then this +test will "fail", and the Kokkos Kernels implementation should be updated with +one of the now-working (and faster?) implementations. 
+*/ + +#ifndef TEST_COMMON_ALIGNPTRTO_HPP +#define TEST_COMMON_ALIGNPTRTO_HPP + +#include +#include +#include + +namespace { + +// the original Kokkos Kernels implementation +template +KOKKOS_INLINE_FUNCTION T *f0(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); +} + +// an implementation that works for SYCL +template +KOKKOS_INLINE_FUNCTION T *f1(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + while (ptrVal % alignof(T)) { + ++ptrVal; + } + return reinterpret_cast(ptrVal); +} + +// another valid implementation +template +KOKKOS_INLINE_FUNCTION T *f2(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal + alignof(T) - 1) / alignof(T) * + alignof(T)); +} + +// the way GCC does it (roughly) +template +KOKKOS_INLINE_FUNCTION T *f3(InPtr p) { + std::uintptr_t ptrVal = reinterpret_cast(p); + return reinterpret_cast((ptrVal - uint64_t(1) + alignof(T)) & + -alignof(T)); +} + +// Function to be executed by each team +template +struct TeamFunction { + TeamFunction() = default; + TeamFunction(const Results &results) : results_(results) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const Team &team) const { + // get an "aligned" pointer to scratch memory + char *shmem = (char *)(team.team_shmem().get_shmem(team.team_size() * + sizeof(double))); + double *vals; + if constexpr (0 == TEST_FN) { + vals = f0(shmem); + } else if constexpr (1 == TEST_FN) { + vals = f1(shmem); + } else if constexpr (2 == TEST_FN) { + vals = f2(shmem); + } else if constexpr (3 == TEST_FN) { + vals = f3(shmem); + } else if constexpr (4 == TEST_FN) { + vals = KokkosKernels::Impl::alignPtrTo(shmem); + } else { + static_assert(std::is_void_v, "Unexpected test function"); + } + + const size_t i = team.team_rank(); + double val = team.team_rank(); + vals[i] = 0; // zero shared memory + Kokkos::atomic_add(&vals[i], val); +#if 0 // debugging + Kokkos::printf("%s:%i result(%lu) += %f yielded %f\n", __FILE__, __LINE__, i, val, vals[i]); +#endif + + results_(i) = vals[i]; + } + + size_t team_shmem_size(int team_size) const { + return team_size * sizeof(double); + } + + Results results_; +}; + +// use atomic add to set result(i) = i +template +void test_alignPtrTo() { + using MemorySpace = typename Device::memory_space; + using ExecSpace = typename Device::execution_space; + using TestView = Kokkos::View; + using TestPolicy = Kokkos::TeamPolicy; + const int teamSize = TestPolicy(1, Kokkos::AUTO) + .team_size_max(TeamFunction(), + Kokkos::ParallelForTag{}); + + ExecSpace space; + + TestView results("TestView", teamSize); + TestPolicy policy(space, 1, teamSize); + Kokkos::parallel_for("test alignment", policy, + TeamFunction(results)); + + int errs; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, teamSize), + KOKKOS_LAMBDA(int i, int &lerr) { lerr += (results(i) != i); }, errs); + +// if SYCL is enabled, only TEST_FN 1 and 4 should work +#if defined(KOKKOS_ENABLE_SYCL) + if constexpr (std::is_same_v) { + if constexpr ((1 == TEST_FN) || (4 == TEST_FN)) { + EXPECT_EQ(0, errs); + } else { + EXPECT_NE(0, errs); + } + } else { + EXPECT_EQ(0, errs); + } +#else + EXPECT_EQ(0, errs); +#endif +} + +TEST_F(TestCategory, common_AlignPtrTo_0) { test_alignPtrTo<0, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_1) { test_alignPtrTo<1, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_2) { test_alignPtrTo<2, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_3) { 
test_alignPtrTo<3, TestDevice>(); } +TEST_F(TestCategory, common_AlignPtrTo_kk) { test_alignPtrTo<4, TestDevice>(); } + +} // anonymous namespace + +#endif // TEST_COMMON_ALIGNPTRTO From 6d7e977ba93906b37ceaa07314052d902e863547 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 2 Jul 2024 13:06:44 -0600 Subject: [PATCH 21/32] Help gcc/8.3 with ctad issue Resolves #2264 Co-authored-by: Carl Pearson --- common/unit_test/Test_Common_AlignPtrTo.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unit_test/Test_Common_AlignPtrTo.hpp b/common/unit_test/Test_Common_AlignPtrTo.hpp index f60887cd80..760cddd5a2 100644 --- a/common/unit_test/Test_Common_AlignPtrTo.hpp +++ b/common/unit_test/Test_Common_AlignPtrTo.hpp @@ -136,7 +136,7 @@ void test_alignPtrTo() { int errs; Kokkos::parallel_reduce( - Kokkos::RangePolicy(space, 0, teamSize), + Kokkos::RangePolicy(space, 0, teamSize), KOKKOS_LAMBDA(int i, int &lerr) { lerr += (results(i) != i); }, errs); // if SYCL is enabled, only TEST_FN 1 and 4 should work From 8130cf9ab45db5b2bebad4586c9827a18b2f6bcb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 08:59:56 -0600 Subject: [PATCH 22/32] Bump actions/upload-artifact from 4.3.3 to 4.3.4 (#2266) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.3 to 4.3.4. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/65462800fd760344b1a7b4382951275a0abb4808...0b2256b8c012f0828dc542b3febcab082c67f72b) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 4396ad1cdb..bf06213b40 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -65,7 +65,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 with: name: SARIF file path: results.sarif From 48e941b0b483522762e97a41ce6e9efea1677499 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 9 Jul 2024 10:43:09 -0600 Subject: [PATCH 23/32] handle_t* -> unique_ptr in Bsr SpMV unit tests (#2269) --- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index e9b23298f9..699afb2510 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -383,9 +383,9 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, using handle_t = SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -405,14 +405,14 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, } */ - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); if (beta == scalar_type(0)) { - test_spmv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spmv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } @@ -644,9 +644,9 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, SPMVHandle; // cover a variety of algorithms - std::vector handles; + std::vector> handles; for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) - handles.push_back(new handle_t(algo)); + handles.push_back(std::make_unique(algo)); // Tensor core algorithm temporarily disabled, fails on V100 /* @@ -670,14 +670,15 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); auto [x_with_nans, y_with_nans] = random_multivecs_for_spm_mv(mode, a, numVecs, true); - for (handle_t *handle : handles) { + for (std::unique_ptr &handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x, + y); if (beta == scalar_type(0)) { - test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, + test_spm_mv(handle.get(), mode, alpha, beta, a, acrs, maxNnzPerRow, x_with_nans, y_with_nans); } } From e1cd832e135628ef27248e368ad4d29d4a8c37e6 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Tue, 9 Jul 2024 13:07:25 -0600 Subject: [PATCH 24/32] Workarounds for removed cusparse functions (#2270) cusparse 12.5 removed some functions that were deprecated, like the ILU factorizations and the legacy csrsv (sparse triangular solve) functions. 
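
The guard amounts to a compile-time version check. A minimal sketch of the
pattern this patch adds (the macro name and the 12500 threshold are taken from
the diff below; the branch bodies here are only placeholders):

// Legacy cuSPARSE ILU/csrsv APIs are available only before cuSPARSE 12.5.
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 12500)
#define USE_CUSPARSE_ILU
#endif

#ifdef USE_CUSPARSE_ILU
// ... legacy cusparseDcsrilu02 / cusparseDcsrsv2_solve comparison path ...
#else
// ... cuSPARSE comparison disabled; only the KokkosKernels kernels run ...
#endif
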
As a workaround, if the cusparse version is >= 12.5 then disable the paths in perftests that call those. --- perf_test/sparse/KokkosSparse_spiluk.cpp | 56 ++++++------------ perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 62 ++++++++++++-------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index c85b126019..95dcc78ab1 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -24,7 +24,13 @@ #include #include // std::setprecision -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// cuSPARSE ILU and IC factorizations were removed +// completely in cuSPARSE 12.5 +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 12500) +#define USE_CUSPARSE_ILU +#endif + +#ifdef USE_CUSPARSE_ILU #include #endif @@ -39,8 +45,6 @@ #include #include -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ - (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)) using namespace KokkosSparse; using namespace KokkosSparse::Experimental; using namespace KokkosKernels; @@ -52,8 +56,8 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, int team_size, int /*vector_length*/, /*int idx_offset,*/ int loop) { typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + typedef int lno_t; + typedef int size_type; typedef Kokkos::DefaultExecutionSpace execution_space; typedef typename execution_space::memory_space memory_space; @@ -82,6 +86,11 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "\n\n" << std::endl; if (!afilename.empty()) { +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && !defined(USE_CUSPARSE_ILU) + std::cout << "** Note: cuSPARSE is enabled, but the cusparseXcsrilu*\n"; + std::cout << " functions were removed in cuSPARSE 12.5.\n"; + std::cout << " Only KokkosKernels spiluk will be run.\n\n"; +#endif std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix( @@ -91,11 +100,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, const int nnz = A.nnz(); const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(kin); -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // std::cout << " cusparse: create handle" << std::endl; cusparseStatus_t status; cusparseHandle_t handle = 0; @@ -131,10 +136,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaMalloc((void **)&pBuffer, pBufferSize); -#else - std::cout << "Note: the cuSPARSE TPL is enabled, but either offset=int or " - "ordinal=int is disabled, so it can't be used.\n"; -#endif #endif for (auto test : tests) { @@ -223,11 +224,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "nrm2(A*e-L*U*e) = " << std::setprecision(15) << bb_nrm << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. 
For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { std::cout << "CUSPARSE: No KK interface added yet" << std::endl; @@ -383,7 +380,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, } // end row std::cout << "ILU(0) SUCCESS!" << std::endl; } // fill_lev=0 -#endif #endif // Benchmark @@ -407,11 +403,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME: " << max_time << std::endl; std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU if (fill_lev == 0) { lno_view_t A_row_map("A_row_map", nrows + 1); lno_nnz_view_t A_entries("A_entries", nnz); @@ -441,21 +433,15 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME (cuSPARSE): " << max_time << std::endl; std::cout << "LOOP_MIN_TIME (cuSPARSE): " << min_time << std::endl; } // fill_lev=0 -#endif #endif } // end tests -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) -#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) +#ifdef USE_CUSPARSE_ILU // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrilu02Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); -#endif #endif } // end if (!afilename.empty()) @@ -583,9 +569,3 @@ int main(int argc, char **argv) { Kokkos::finalize(); return 0; } -#else -int main() { - std::cout << "The SPILUK perf_test requires CUDA >= 8.0\n"; - return 0; -} -#endif diff --git a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp index 65120a8827..6b9c244da3 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp @@ -228,25 +228,37 @@ std::string getCuSparseErrorString(cusparseStatus_t status) { /* ========================================================================================= */ #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#if CUSPARSE_VERSION >= 12500 +template +bool check_cusparse(host_crsmat_t &, bool, crsmat_t &, bool, crsmat_t &, int *, + int *, double, int) { + // TODO: call KokkosSparse::sptrsv (if hardcoded problem settings below are + // compatible), or add wrappers for modern interface (cusparseSpSV*) + throw std::logic_error("Legacy cuSPARSE csrsv interface not available."); + return false; +} + +#else + template bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, bool col_majorU, crsmat_t &U, int *perm_r, int *perm_c, double tol, int loop) { using values_view_t = typename crsmat_t::values_type::non_const_type; - using scalar_t = typename values_view_t::value_type; - using size_type = typename crsmat_t::size_type; + using scalar_t = typename values_view_t::value_type; + using size_type = typename crsmat_t::size_type; using host_values_view_t = typename host_crsmat_t::values_type::non_const_type; using execution_space = typename values_view_t::execution_space; - using memory_space = typename execution_space::memory_space; + using memory_space = typename execution_space::memory_space; using host_execution_space = typename 
host_values_view_t::execution_space; - using host_memory_space = typename host_execution_space::memory_space; + using host_memory_space = typename host_execution_space::memory_space; using host_scalar_view_t = Kokkos::View; - using scalar_view_t = Kokkos::View; + using scalar_view_t = Kokkos::View; const scalar_t ZERO(0.0); const scalar_t ONE(1.0); @@ -258,7 +270,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a handle cusparseStatus_t status; cusparseHandle_t handle = 0; - status = cusparseCreate(&handle); + status = cusparseCreate(&handle); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreate failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -269,7 +281,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // > create a empty info structure for L-solve (e.g., analysis results) csrsv2Info_t infoL = 0; - status = cusparseCreateCsrsv2Info(&infoL); + status = cusparseCreateCsrsv2Info(&infoL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -279,14 +291,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // Preparing for L-solve // step 1: create a descriptor size_type nnzL = L.nnz(); - auto graphL = L.graph; // in_graph - auto row_mapL = graphL.row_map; - auto entriesL = graphL.entries; - auto valuesL = L.values; + auto graphL = L.graph; // in_graph + auto row_mapL = graphL.row_map; + auto entriesL = graphL.entries; + auto valuesL = L.values; // NOTE: it is stored in CSC = UPPER + TRANSPOSE cusparseMatDescr_t descrL = 0; - status = cusparseCreateMatDescr(&descrL); + status = cusparseCreateMatDescr(&descrL); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -300,7 +312,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. int pBufferSize; - void *pBufferL = 0; + void *pBufferL = 0; cusparseOperation_t transL = (col_majorL ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -374,14 +386,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), reinterpret_cast(sol.data()), policy, pBufferL); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transL, nrows, nnzL, &alpha, descrL, reinterpret_cast(valuesL.data()), row_mapL.data(), entriesL.data(), infoL, reinterpret_cast(rhs.data()), @@ -404,14 +416,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // Preparing for U-solve size_type nnzU = U.nnz(); - auto graphU = U.graph; // in_graph - auto row_mapU = graphU.row_map; - auto entriesU = graphU.entries; - auto valuesU = U.values; + auto graphU = U.graph; // in_graph + auto row_mapU = graphU.row_map; + auto entriesU = graphU.entries; + auto valuesU = U.values; // > create a empty info structure for U-solve (e.g., analysis results) csrsv2Info_t infoU = 0; - status = cusparseCreateCsrsv2Info(&infoU); + status = cusparseCreateCsrsv2Info(&infoU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateCsrsv2Info failed with " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -420,7 +432,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 1: create a descriptor cusparseMatDescr_t descrU = 0; - status = cusparseCreateMatDescr(&descrU); + status = cusparseCreateMatDescr(&descrU); if (CUSPARSE_STATUS_SUCCESS != status) { std::cout << " ** cusparseCreateMatDescr create status error name " << getCuSparseErrorString(status) << " ** " << std::endl; @@ -438,7 +450,7 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, // ============================================== // step 2: query how much memory used in csrsv2, and allocate the buffer // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. - void *pBufferU = 0; + void *pBufferU = 0; cusparseOperation_t transU = (col_majorU ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE); if (std::is_same::value) { @@ -485,14 +497,14 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, timer.reset(); if (std::is_same::value) { const double alpha = 1.0; - status = cusparseDcsrsv2_solve( + status = cusparseDcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), reinterpret_cast(rhs.data()), policy, pBufferU); } else { const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); - status = cusparseZcsrsv2_solve( + status = cusparseZcsrsv2_solve( handle, transU, nrows, nnzU, &alpha, descrU, reinterpret_cast(valuesU.data()), row_mapU.data(), entriesU.data(), infoU, reinterpret_cast(sol.data()), @@ -652,6 +664,8 @@ bool check_cusparse(host_crsmat_t &Mtx, bool col_majorL, crsmat_t &L, } return success; } +#endif + #else template bool check_cusparse(host_crsmat_t & /*Mtx*/, bool /*col_majorL*/, From ea430c3f558b812b47d6bfc70965005acc9652a2 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 9 Jul 2024 21:45:44 -0600 Subject: [PATCH 25/32] BLAS - gemv: using fallback when mode is 't' or 'c' and onemkl is used (#2272) --- blas/src/KokkosBlas2_gemv.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 614b48d47a..e68f2cca75 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -163,9 +163,11 @@ void gemv(const ExecutionSpace& space, const char trans[], #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #ifdef KOKKOS_ENABLE_SYCL // oneMKL supports both row-major and column-major of A + // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || !std::is_same_v; + useFallback || ((tolower(*trans) == 't' || tolower(*trans) == 'c') && + std::is_same_v); #endif #endif From 994891a23207e5ebbbb81ddbb5f02d90343a2606 Mon Sep 17 00:00:00 2001 From: yasahi-hpc <57478230+yasahi-hpc@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:38:55 +0200 Subject: [PATCH 26/32] Implement batched serial pttrf (#2256) * Batched serial pttrf implementation * fix: use GEMM to add matrices * fix: initialization order * fformat * fix: temporary variable in a test code * fix: docstring of pttrf * check_positive_definitiveness only if KOKKOSKERNELS_DEBUG_LEVEL > 0 * Improve the test for pttrf * fix: int type * fix: cleanup tests for SerialPttrf * cleanup: remove unused deep_copies * fix: docstrings and comments for pttrf * ConjTranspose with conj and Transpose * quick return in pttrf for size 1 or 0 matrix * Add tests for invalid input * fix: info computation --------- Co-authored-by: Yuuichi Asahi --- .../impl/KokkosBatched_Pttrf_Serial_Impl.hpp | 73 +++ .../KokkosBatched_Pttrf_Serial_Internal.hpp | 211 ++++++++ batched/dense/src/KokkosBatched_Pttrf.hpp | 52 ++ .../dense/unit_test/Test_Batched_Dense.hpp | 3 + .../unit_test/Test_Batched_DenseUtils.hpp | 40 ++ .../unit_test/Test_Batched_SerialPttrf.hpp | 467 ++++++++++++++++++ .../Test_Batched_SerialPttrf_Complex.hpp | 31 ++ .../Test_Batched_SerialPttrf_Real.hpp | 31 ++ blas/impl/KokkosBlas_util.hpp | 1 + 9 files changed, 909 insertions(+) create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp create mode 100644 batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp create mode 100644 batched/dense/src/KokkosBatched_Pttrf.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf.hpp create mode 100644 
batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp create mode 100644 batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp new file mode 100644 index 0000000000..b0ea39fa3f --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Impl.hpp @@ -0,0 +1,73 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pttrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrfInput( + [[maybe_unused]] const DViewType &d, [[maybe_unused]] const EViewType &e) { + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::pttrf: EViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, + "KokkosBatched::pttrf: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, + "KokkosBatched::pttrf: EViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrf: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } +#endif + return 0; +} + +template <> +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + if (d.extent(0) == 1) return (d(0) < 0 ? 1 : 0); + + auto info = checkPttrfInput(d, e); + if (info) return info; + + return SerialPttrfInternal::invoke( + d.extent(0), d.data(), d.stride(0), e.data(), e.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp new file mode 100644 index 0000000000..5b4d3fb182 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrf_Serial_Internal.hpp @@ -0,0 +1,211 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrfInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, + ValueType *KOKKOS_RESTRICT d, + const int ds0, + ValueType *KOKKOS_RESTRICT e, + const int es0); + + template + KOKKOS_INLINE_FUNCTION static int invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + ValueType *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto ei_tmp = e[i * es0]; + e[i * es0] = ei_tmp / d[i * ds0]; + d[(i + 1) * ds0] -= e[i * es0] * ei_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. + const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +/// +/// Complex matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPttrfInternal::invoke( + const int n, ValueType *KOKKOS_RESTRICT d, const int ds0, + Kokkos::complex *KOKKOS_RESTRICT e, const int es0) { + int info = 0; + + auto update = [&](const int i) { + auto eir_tmp = e[i * es0].real(); + auto eii_tmp = e[i * es0].imag(); + auto f_tmp = eir_tmp / d[i * ds0]; + auto g_tmp = eii_tmp / d[i * ds0]; + e[i * es0] = Kokkos::complex(f_tmp, g_tmp); + d[(i + 1) * ds0] = d[(i + 1) * ds0] - f_tmp * eir_tmp - g_tmp * eii_tmp; + }; + + auto check_positive_definitiveness = [&](const int i) { + return (d[i] <= 0.0) ? (i + 1) : 0; + }; + + // Compute the L*D*L' (or U'*D*U) factorization of A. 
+ const int i4 = (n - 1) % 4; + for (int i = 0; i < i4; i++) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + } // for (int i = 0; i < i4; i++) + + for (int i = i4; i < n - 4; i += 4) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i); + if (info) { + return info; + } +#endif + + update(i); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 1); + if (info) { + return info; + } +#endif + + update(i + 1); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 2); + if (info) { + return info; + } +#endif + + update(i + 2); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(i + 3); + if (info) { + return info; + } +#endif + + update(i + 3); + + } // for (int i = i4; i < n-4; 4) + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + info = check_positive_definitiveness(n - 1); + if (info) { + return info; + } +#endif + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pttrf.hpp b/batched/dense/src/KokkosBatched_Pttrf.hpp new file mode 100644 index 0000000000..4fcc944dc8 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pttrf.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRF_HPP_ +#define KOKKOSBATCHED_PTTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrf: +/// Compute the Cholesky factorization L*D*L**T (or L*D*L**H) of a real +/// symmetric (or complex Hermitian) positive definite tridiagonal matrix A_l +/// for all l = 0, ..., N +/// +/// \tparam DViewType: Input type for the a diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the a upper/lower diagonal matrix, +/// needs to be a 1D view +/// +/// \param d [inout]: n diagonal elements of the diagonal matrix D +/// \param e [inout]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// +/// No nested parallel_for is used inside of the function. 
+/// + +template +struct SerialPttrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, + const EViewType &e); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRF_HPP_ diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 7b0ee58312..76215b58f8 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -49,6 +49,9 @@ #include "Test_Batched_SerialTrtri_Real.hpp" #include "Test_Batched_SerialTrtri_Complex.hpp" #include "Test_Batched_SerialSVD.hpp" +#include "Test_Batched_SerialPttrf.hpp" +#include "Test_Batched_SerialPttrf_Real.hpp" +#include "Test_Batched_SerialPttrf_Complex.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index 689ff4f7a5..c1328291fb 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -111,6 +111,46 @@ void create_banded_triangular_matrix(InViewType& in, OutViewType& out, } Kokkos::deep_copy(out, h_out); } + +/// \brief Create a diagonal matrix from an input vector: +/// Copies the input vector into the diagonal of the output matrix specified +/// by the parameter k. k > 0 means that the matrix is upper-diagonal and +/// k < 0 means the lower-diagonal. k = 0 means the diagonal. +/// +/// \tparam InViewType: Input type for the vector, needs to be a 2D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched vector, a rank 2 view +/// \param out [out]: Output batched matrix, where the diagonal compnent +/// specified by k is filled with the input vector, a rank 3 view +/// \param k [in]: The diagonal offset to be filled (default is 0). +/// +template +void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == in.extent(1) + abs(k)); + + int i1_start = k >= 0 ? 0 : -k; + int i2_start = k >= 0 ? k : 0; + + // Zero clear the output matrix + using ScalarType = typename OutViewType::non_const_value_type; + Kokkos::deep_copy(h_out, ScalarType(0.0)); + + Kokkos::deep_copy(h_in, in); + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + h_out(i0, i1 + i1_start, i1 + i2_start) = h_in(i0, i1); + } + } + + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp new file mode 100644 index 0000000000..6ee7818ddc --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf.hpp @@ -0,0 +1,467 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrf { + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) + : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline int run() { + using value_type = typename DViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, + const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, + _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrf test for random matrix +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d("d", N, BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), + e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + + using execution_space = typename DeviceType::execution_space; + 
Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, + realRandEnd + BlkSize); + Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + + auto h_e_upper = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = + Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e_lower) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**H + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // FIXME: We should use SerialGemm Trans::ConjTranspose. + // For the moment, we compute the complex conjugate of L and + // then use Trans::Transpose. 
+ // Gemm to compute (L*D)*L**H -> A_reconst + // Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + // A_reconst) + // .run(); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + Kokkos::deep_copy(h_L, L); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_L(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(L, h_L); + + // Gemm to compute (L*D)*(conj(L))**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pttrf test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d("d", N, BlkSize), + d2("d2", N, BlkSize); // Diagonal components + View2DType e("e", N, + BlkSize_minus_1); // lower diagonal components + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(d2, -reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + // Early return if BlkSize is 0 or 1 + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + // For negative values, info should be 1 for BlkSize = 1 + auto info2 = Functor_BatchedSerialPttrf(d2, e) + .run(); + + Kokkos::fence(); + + int expected_info2 = BlkSize == 0 ? 
0 : N; + EXPECT_EQ(info, 0); + EXPECT_EQ(info2, expected_info2); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_d = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d); + auto h_d2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d2); + + // Check if d is unchanged + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_d(ib, i), reference_value, eps); + EXPECT_NEAR_KK(h_d2(ib, i), -reference_value, eps); + } + } +} + +template +/// \brief Implementation details of batched pttrf test +/// +/// \param N [in] Batch size of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrf_analytical(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), + A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), LD("LD", N, BlkSize, BlkSize), + L("L", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiaonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(e, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, 1.0, A) + .run(); + + // Factorize matrix A -> L * D * L**T + // d and e are updated by pttrf + auto info = Functor_BatchedSerialPttrf(d, e) + .run(); + + Kokkos::fence(); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(info, 0); +#endif + + // Reconstruct L and D from factorized matrix + create_diagonal_matrix(e, EL, -1); + create_diagonal_matrix(d, D); + + // Copy I to L + Kokkos::deep_copy(L, I); + + // EL + I by EL * I + L (result stored in L) + Functor_BatchedSerialGemm(1.0, EL, I, 1.0, L) + .run(); + + // Reconstruct A by L*D*L**T + // Gemm to compute L*D -> LD + Functor_BatchedSerialGemm(1.0, L, D, 0.0, LD) + .run(); + + // Gemm to compute (L*D)*L**T -> A_reconst + Functor_BatchedSerialGemm(1.0, LD, L, 0.0, + A_reconst) + .run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check A = L*D*L.T + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pttrf +} // namespace Test + +template +int test_batched_pttrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = 
Kokkos::LayoutLeft; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + for (int i = 0; i < 2; i++) { + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(1, i); + Test::Pttrf::impl_test_batched_pttrf_quick_return< + DeviceType, ScalarType, LayoutType, AlgoTagType>(2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrf::impl_test_batched_pttrf(1, i); + Test::Pttrf::impl_test_batched_pttrf(2, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 1, i); + Test::Pttrf::impl_test_batched_pttrf_analytical( + 2, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp new file mode 100644 index 0000000000..febccc5cb3 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Complex.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_fcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_dcomplex) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp new file mode 100644 index 0000000000..8b0fb658fe --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrf_Real.hpp @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
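For reference alongside the pttrf tests above: SerialPttrf factors a symmetric (Hermitian, for complex scalars) positive-definite tridiagonal matrix as A = L*D*L**T, overwriting d with D and e with the sub-diagonal of the unit lower-bidiagonal factor L. The sketch below is a minimal serial version of that recurrence for the real case, written only to illustrate what the tests reconstruct and compare against; the function name and the use of std::vector are illustrative, not the KokkosBatched API. Its per-matrix info convention (i+1 at the first non-positive pivot) is consistent with the quick-return test expecting info2 == N, i.e. an info of 1 from each of the N one-by-one matrices with a negative diagonal.

#include <cassert>
#include <vector>

// Illustrative serial LDL**T factorization of a symmetric positive-definite
// tridiagonal matrix. d (size n) holds the diagonal, e (size n-1) the
// sub-diagonal; both are overwritten in place so that A = L*D*L**T with
// D = diag(d) and L(i+1,i) = e[i]. Returns 0 on success, or i+1 if the
// i-th pivot is not positive.
inline int reference_pttrf(std::vector<double>& d, std::vector<double>& e) {
  const int n = static_cast<int>(d.size());
  assert(n == 0 || e.size() + 1 == d.size());
  for (int i = 0; i < n - 1; ++i) {
    if (d[i] <= 0.0) return i + 1;  // matrix is not positive definite
    const double ei = e[i];
    e[i] = ei / d[i];               // sub-diagonal of L
    d[i + 1] -= e[i] * ei;          // update the next diagonal pivot
  }
  if (n > 0 && d[n - 1] <= 0.0) return n;
  return 0;
}

For example, with the values used in impl_test_batched_pttrf_analytical (d = 4, e = 1), the first step produces e[0] = 0.25 and d[1] = 3.75.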
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrf_float) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrf_double) { + using algo_tag_type = typename Algo::Pttrf::Unblocked; + + test_batched_pttrf(); +} +#endif diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index ecb72e7c9a..1fc6b7d480 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -85,6 +85,7 @@ struct Algo { using SolveLU = Level3; using QR = Level3; using UTV = Level3; + using Pttrf = Level3; struct Level2 { struct Unblocked {}; From d310f1aa8c4b2bc7513d34eff0286190c11460ab Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 10 Jul 2024 14:29:55 -0600 Subject: [PATCH 27/32] A little sptrsv cleanup before the main block effort (#2247) * Some cleanup and refactoring * First round of cleanup complete * Fix a couple warnings * formatting --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 26 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 6557 ++++++++--------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 39 +- sparse/src/KokkosKernels_Handle.hpp | 6 +- sparse/src/KokkosSparse_sptrsv_handle.hpp | 134 +- 5 files changed, 3267 insertions(+), 3495 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 415ccf87a0..3caa2bcc31 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -47,20 +47,18 @@ struct IlukWrap { // // Useful types // - using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; - using lno_t = typename IlukHandle::nnz_lno_t; - using size_type = typename IlukHandle::size_type; - using scalar_t = typename IlukHandle::nnz_scalar_t; - using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; - using HandleDeviceValueType = typename IlukHandle::nnz_value_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - using LevelViewType = typename IlukHandle::nnz_lno_view_t; - using karith = typename Kokkos::ArithTraits; - using team_policy = typename IlukHandle::TeamPolicy; - using member_type = typename team_policy::member_type; - using range_policy = typename IlukHandle::RangePolicy; + using execution_space = typename IlukHandle::execution_space; + using memory_space = typename IlukHandle::memory_space; + using lno_t = typename IlukHandle::nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using scalar_t = typename IlukHandle::nnz_scalar_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using LevelViewType = typename IlukHandle::nnz_lno_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename IlukHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename IlukHandle::RangePolicy; static team_policy get_team_policy(const size_type nrows, const int team_size) { diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a64a4d23bc..d385a390cd 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ 
b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -27,15 +27,11 @@ #include #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV - // Enable supernodal sptrsv #include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" - #include "KokkosBatched_Util.hpp" - #include "KokkosBlas2_team_gemv_spec.hpp" - #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif @@ -48,834 +44,997 @@ #include "cuda_profiler_api.h" #endif -namespace KokkosSparse { -namespace Impl { -namespace Experimental { - #if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && \ defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT #endif -struct UnsortedTag {}; - -struct LargerCutoffTag {}; - -struct UnsortedLargerCutoffTag {}; - -template -void print_view1d_solve(const ViewType dv, size_t range = 0) { - auto v = Kokkos::create_mirror_view(dv); - Kokkos::deep_copy(v, dv); - std::cout << "Output for view " << v.label() << std::endl; - range = range == 0 ? dv.extent(0) : range; - for (size_t i = 0; i < range; ++i) { - std::cout << "v(" << i << ") = " << v(i) << " , "; - } - std::cout << std::endl; -} - -// Needed for cudagraphs -struct EmptyFunctor { - KOKKOS_INLINE_FUNCTION - void operator()(const int) const {} -}; - -// This functor unifies the lower and upper implementations, the hope is the -// "is_lowertri" check does not add noticable time on larger problems -template -struct TriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - - TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool &is_lowertri_, const long &node_count_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? 
(rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } -}; - -template -struct TriLvlSchedTP1SolverFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long dense_nrows; - - TriLvlSchedTP1SolverFunctorDiagValues(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, - LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, - const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); +namespace KokkosSparse { +namespace Impl { +namespace Experimental { - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? 
(rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); +template +struct SptrsvWrap { + // + // Useful types + // + using execution_space = typename TriSolveHandle::execution_space; + using memory_space = typename TriSolveHandle::memory_space; + using temp_mem_space = typename TriSolveHandle::HandleTempMemorySpace; + using lno_t = typename TriSolveHandle::nnz_lno_t; + using size_type = typename TriSolveHandle::size_type; + using scalar_t = typename TriSolveHandle::scalar_t; + using row_map_t = typename TriSolveHandle::nnz_row_view_t; + using entries_t = typename TriSolveHandle::nnz_lno_view_t; + using values_t = typename TriSolveHandle::nnz_scalar_view_t; + using work_view_t = + Kokkos::View>; + using work_view_int_t = + Kokkos::View>; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename TriSolveHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename TriSolveHandle::RangePolicy; + using range_type = Kokkos::pair; + + // Tag structs + struct UnsortedTag {}; + struct LargerCutoffTag {}; + struct UnsortedLargerCutoffTag {}; + + template + static void print_view1d_solve(const ViewType dv, size_t range = 0) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v, dv); + std::cout << "Output for view " << v.label() << std::endl; + range = range == 0 ? dv.extent(0) : range; + for (size_t i = 0; i < range; ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; } + std::cout << std::endl; } -}; - -template -struct TriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - const bool is_lowertri; - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - long dense_nrows; - - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - // ASSUMPTION: 
sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange + // Needed for cudagraphs + struct EmptyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int) const {} + }; + + // This functor unifies the lower and upper implementations, the hope is the + // "is_lowertri" check does not add noticable time on larger problems + template + struct TriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool &is_lowertri_, + const long &node_count_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); - team.team_barrier(); - } + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + team.team_barrier(); - size_t nrows = row_map.extent(0) - 1; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri ? 
(rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } + } - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + auto diag = -1; - team.team_barrier(); - } -}; - -// Lower vs Upper Multi-block Functors - -template -struct LowerTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct LowerTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - 
typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + } + }; + + template + struct TriLvlSchedTP1SolverFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + ValuesType diagonal_values; // inserted according to rowid + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long dense_nrows; + + TriLvlSchedTP1SolverFunctorDiagValues( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const ValuesType &diagonal_values_, const bool is_lowertri_, + long node_count_, long dense_nrows_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + diagonal_values(diagonal_values_), + is_lowertri(is_lowertri_), + node_count(node_count_), + dense_nrows(dense_nrows_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - team.team_barrier(); + team.team_barrier(); - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + 
// lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : + // (rhs_rowid+diff)/values(soffset); + lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + } } - } + }; + + template + struct TriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + const bool is_lowertri; + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + long dense_nrows; + + TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const bool is_lowertri_, long node_count_, + long node_groups_ = 0, long dense_nrows_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + is_lowertri(is_lowertri_), + node_count(node_count_), + node_groups(node_groups_), + dense_nrows(dense_nrows_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri + ? 
(rhs_rowid + diff) / values(eoffset - 1) + : (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); + team.team_barrier(); } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct LowerTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + 
my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid + team.team_barrier(); + } + }; + + // Lower vs Upper Multi-block Functors + + template + struct LowerTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry is final index in the + // list + + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; + } + } // end for ptr + } - size_t nrows = row_map.extent(0) - 1; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + auto diag = -1; + + for (long ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + diag = ptr; + } + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); + } + }; + + template + struct LowerTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t 
diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); + team.team_barrier(); - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } + } - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - team.team_barrier(); - } -}; + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// ----------------------------------------------------------- -// Helper functors for Lower-triangular solve with SpMV -template -struct SparseTriSupernodalSpMVFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; + auto diag = -1; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); - using scalar_t = typename LHSType::non_const_value_type; + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } + } + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level hierarchical parallelism + // for alg dev + template + struct LowerTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), 
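For orientation before the functor-by-functor details: every TriLvlSched*/LowerTriLvlSched* functor in this file parallelizes the same level-scheduled substitution. nodes_grouped_by_level lists the rows level by level (node_count is the running offset into that list), rows within one level have no dependencies on each other, and each row is finished by dividing by its diagonal, which the sorted code paths assume is the last entry of the row. The serial host-side sketch below states that algorithm for the lower-triangular case; the function name and the explicit nodes_per_level vector are illustrative assumptions, not the handle's API.

#include <cassert>
#include <cstddef>
#include <vector>

// Illustrative serial reference of the level-scheduled lower-triangular solve
// that the TriLvlSched* functors parallelize. All rows listed for one level
// can be processed concurrently, which is what the team/range policies above
// exploit. Assumes a sorted CRS matrix whose diagonal is stored as the last
// entry of each row (the "ASSUMPTION" noted in the sorted code paths).
inline void reference_lower_trisolve(
    const std::vector<int>& row_map, const std::vector<int>& entries,
    const std::vector<double>& values,
    const std::vector<int>& nodes_grouped_by_level,
    const std::vector<int>& nodes_per_level, const std::vector<double>& rhs,
    std::vector<double>& lhs) {
  int node_count = 0;  // running offset into nodes_grouped_by_level
  for (std::size_t lvl = 0; lvl < nodes_per_level.size(); ++lvl) {
    for (int k = 0; k < nodes_per_level[lvl]; ++k) {
      const int rowid   = nodes_grouped_by_level[node_count + k];
      const int soffset = row_map[rowid];
      const int eoffset = row_map[rowid + 1];
      assert(entries[eoffset - 1] == rowid);  // diagonal stored last
      double diff = 0.0;  // the team reduction in the functors
      for (int ptr = soffset; ptr < eoffset - 1; ++ptr) {
        diff -= values[ptr] * lhs[entries[ptr]];
      }
      lhs[rowid] = (rhs[rowid] + diff) / values[eoffset - 1];
    }
    node_count += nodes_per_level[lvl];
  }
}

The TP1 functors map one team to one row of a level; the TP2 variants instead give each team node_groups rows and reduce each row over a ThreadVectorRange.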
+ values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + } // end if + }); // end TeamThreadRange - using work_view_t = - typename Kokkos::View>; - - int flag; - long node_count; - NGBLType nodes_grouped_by_level; - - const int *supercols; - const int *workoffset; - - LHSType X; - work_view_t work; - - // constructor - SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, - const NGBLType &nodes_grouped_by_level_, - const int *supercols_, const int *workoffset_, - LHSType &X_, work_view_t work_) - : flag(flag_), - node_count(node_count_), - nodes_grouped_by_level(nodes_grouped_by_level_), - supercols(supercols_), - workoffset(workoffset_), - X(X_), - work(work_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); + team.team_barrier(); + } - auto s = nodes_grouped_by_level(node_count + league_rank); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - // copy vector elements for the diagonal to input vector (work) - // and zero out the corresponding elements in output (X) - int w1 = workoffset[s]; - int j1 = supercols[s]; - // number of columns in the s-th supernode column - int nscol = supercols[s + 1] - j1; + team.team_barrier(); + } + }; - if (flag == -2) { - // copy X to work - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - } - } else if (flag == -1) { - // copy work to X - for (int j = team_rank; j < nscol; j += team_size) { - X(j1 
+ j) = work(w1 + j); - } - } else if (flag == 1) { - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = X(j1 + j); - X(j1 + j) = zero; - } - } else { - // reinitialize work to zero - for (int j = team_rank; j < nscol; j += team_size) { - work(w1 + j) = zero; +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) + // ----------------------------------------------------------- + // Helper functors for Lower-triangular solve with SpMV + template + struct SparseTriSupernodalSpMVFunctor { + int flag; + long node_count; + entries_t nodes_grouped_by_level; + + const int *supercols; + const int *workoffset; + + LHSType X; + work_view_t work; + + // constructor + SparseTriSupernodalSpMVFunctor(int flag_, long node_count_, + const entries_t &nodes_grouped_by_level_, + const int *supercols_, + const int *workoffset_, LHSType &X_, + work_view_t work_) + : flag(flag_), + node_count(node_count_), + nodes_grouped_by_level(nodes_grouped_by_level_), + supercols(supercols_), + workoffset(workoffset_), + X(X_), + work(work_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // copy vector elements for the diagonal to input vector (work) + // and zero out the corresponding elements in output (X) + int w1 = workoffset[s]; + int j1 = supercols[s]; + // number of columns in the s-th supernode column + int nscol = supercols[s + 1] - j1; + + if (flag == -2) { + // copy X to work + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + } + } else if (flag == -1) { + // copy work to X + for (int j = team_rank; j < nscol; j += team_size) { + X(j1 + j) = work(w1 + j); + } + } else if (flag == 1) { + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = X(j1 + j); + X(j1 + j) = zero; + } + } else { + // reinitialize work to zero + for (int j = team_rank; j < nscol; j += team_size) { + work(w1 + j) = zero; + } } + team.team_barrier(); } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Lower-triangular solve -template -struct LowerTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; + }; + + // ----------------------------------------------------------- + // Functor for Lower-triangular solve + template + struct LowerTriSupernodalFunctor { + const bool unit_diagonal; + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrView colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + LowerTriSupernodalFunctor( // supernode info + const bool unit_diagonal_, const bool invert_diagonal_, + const bool invert_offdiagonal_, const int *supercols_, + // L in CSC + const ColptrView &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick 
kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + : unit_diagonal(unit_diagonal_), + invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // supernodal column size + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + // > number of columns in the s-th supernode column + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + + // create a view for the s-th supernocal column + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataL = const_cast(values.data()); + Kokkos::View + viewL(&dataL[i1], nsrow, nscol); + + // extract part of the solution, corresponding to the diagonal block + auto Xj = Kokkos::subview(X, range_type(j1, j2)); - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - using range_type = Kokkos::pair; - - const bool unit_diagonal; - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrView colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - LowerTriSupernodalFunctor( // supernode info - const bool unit_diagonal_, const bool invert_diagonal_, - const bool invert_offdiagonal_, const int *supercols_, - // L in CSC - const ColptrView &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, // workspace - work_view_t work_, integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, long node_count_) - : unit_diagonal(unit_diagonal_), - invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* 
---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); + const int workoffset = work_offset(s); + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, workoffset + nsrow)); - // supernodal column size - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - // > number of columns in the s-th supernode column - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; + if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve + if (invert_offdiagonal) { + // combined TRSM solve with diagonal + GEMV update with off-diagonal + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nsrow)); // needed for gemv instead of trmv/trsv + auto Ljj = + Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Ljj, + Xj, + zero, + Y); + team.team_barrier(); + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } + team.team_barrier(); + } else { + /* TRSM with diagonal block */ + // extract diagonal and off-diagonal blocks of L + auto Ljj = + Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + // workspace + auto Y = Kokkos::subview( + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + // calling team-level "Unblocked" gemv on small-size diagonal in + // KokkosBatched + KokkosBlas::TeamGemv< + member_type, KokkosBlas::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ljj, Y, + zero, Xj); + } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + if (unit_diagonal) { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::Unit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); + } else { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + Xjj); + } + } + team.team_barrier(); - // create a view for the s-th supernocal column - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataL = const_cast(values.data()); - Kokkos::View - viewL(&dataL[i1], nsrow, nscol); + /* GEMM to update with off diagonal blocks */ + auto Lij = + Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Lij, + Xj, + zero, + Z); + team.team_barrier(); + } + } - // extract part of the solution, corresponding to the diagonal block - auto Xj = Kokkos::subview(X, range_type(j1, j2)); + /* scatter vectors back into X */ + int i2 = i1 + nscol; // offset into rowind + int nsrow2 = + nsrow - + nscol; // "total" number of rows in all the off-diagonal supernodes + Kokkos::View> + 
Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } + team.team_barrier(); + } + }; - // workspace - const int workoffset = work_offset(s); - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - - if (diag_kernel_type(level) != 3) { // not a device-level TRSM-solve - if (invert_offdiagonal) { - // combined TRSM solve with diagonal + GEMV update with off-diagonal - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed for gemv instead of trmv/trsv - auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, Xj, - zero, - Y); - team.team_barrier(); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } + // ----------------------------------------------------------- + // Functor for Upper-triangular solve in CSR + template + struct UpperTriSupernodalFunctor { + // NOTE: we currently supports only default_layout = LayoutLeft + using SupernodeView = + typename Kokkos::View; + + bool invert_diagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + UpperTriSupernodalFunctor( // supernode info + bool invert_diagonal_, const int *supercols_, + // U in CSR + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + int level_, work_view_int_t &kernel_type_, + work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + LHSType &X_, + // workspace + work_view_t &work_, work_view_int_t &work_offset_, + // + const entries_t &nodes_grouped_by_level_, long node_count_) + : invert_diagonal(invert_diagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // number of columns in the s-th supernode column + int j1 = supercols[s]; + int j2 = supercols[s + 1]; + int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + int i1 = colptr(j1); + int nsrow = colptr(j1 + 1) - i1; + + // create a view of the s-th supernocal row of U + scalar_t *dataU = const_cast(values.data()); + SupernodeView viewU(&dataU[i1], nsrow, nscol); + + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); + using Xj_type = decltype(Xj); + + // workspaces + int workoffset = work_offset(s); + + // "total" number of 
rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; + /* gather vector into Z */ + int i2 = i1 + nscol; // offset into rowind + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, + workoffset + + nsrow)); // needed with gemv for update&scatter + using Z_type = decltype(Z); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Z(ii) = X(i); + } + team.team_barrier(); + /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ + if (diag_kernel_type(level) != 3) { + // not device-level GEMV-udpate + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + using Uij_type = decltype(Uij); + KokkosBlas::TeamGemv:: + template invoke( + team, -one, Uij, Z, one, Xj); team.team_barrier(); - } else { + /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of L - auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); + // extract diagonal and off-diagonal blocks of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + using Ujj_type = decltype(Ujj); + if (invert_diagonal) { // workspace auto Y = Kokkos::subview( @@ -883,894 +1042,540 @@ struct LowerTriSupernodalFunctor { range_type( workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv + using Y_type = decltype(Y); for (int ii = team_rank; ii < nscol; ii += team_size) { Y(ii) = Xj(ii); } team.team_barrier(); - // calling team-level "Unblocked" gemv on small-size diagonal in - // KokkosBatched - KokkosBlas::TeamGemv::invoke(team, - one, - Ljj, - Y, - zero, - Xj); + + // caling team-level kernel in KokkosBatched on a small-size diagonal + KokkosBlas::TeamGemv:: + template invoke( + team, one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); - if (unit_diagonal) { - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::Unit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, - Xjj); + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Left, + KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, + Xjj); + } + team.team_barrier(); + } + } + }; + + // ----------------------------------------------------------- + // Functor for Upper-triangular solve in CSC + template + struct UpperTriTranSupernodalFunctor { + const bool invert_diagonal; + const bool invert_offdiagonal; + const int *supercols; + ColptrType colptr; + RowindType rowind; + ValuesType values; + + int level; + work_view_int_t kernel_type; + work_view_int_t diag_kernel_type; + + LHSType X; + + work_view_t work; // needed with gemv for update&scatter + work_view_int_t work_offset; + + entries_t nodes_grouped_by_level; + + long node_count; + + // constructor + UpperTriTranSupernodalFunctor( // supernode info + const bool invert_diagonal_, const bool invert_offdiagonal_, + const int *supercols_, + + // U in CSC + const ColptrType &colptr_, const RowindType &rowind_, + const ValuesType &values_, + // options to pick kernel type + const int level_, const work_view_int_t &kernel_type_, + const work_view_int_t &diag_kernel_type_, + // right-hand-side (input), solution (output) + const LHSType &X_, + // workspace + const work_view_t &work_, const work_view_int_t &work_offset_, + // + const entries_t 
&nodes_grouped_by_level_, const long node_count_) + : invert_diagonal(invert_diagonal_), + invert_offdiagonal(invert_offdiagonal_), + supercols(supercols_), + colptr(colptr_), + rowind(rowind_), + values(values_), + level(level_), + kernel_type(kernel_type_), + diag_kernel_type(diag_kernel_type_), + X(X_), + work(work_), + work_offset(work_offset_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_) {} + + // operator + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + /* ---------------------------------------------------------------------- + */ + /* get inputs */ + /* ---------------------------------------------------------------------- + */ + const int league_rank = team.league_rank(); // batch id + const int team_size = team.team_size(); + const int team_rank = team.team_rank(); + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto s = nodes_grouped_by_level(node_count + league_rank); + + // number of columns in the s-th supernode column + const int j1 = supercols[s]; + const int j2 = supercols[s + 1]; + const int nscol = j2 - j1; + // "total" number of rows in all the supernodes (diagonal+off-diagonal) + const int i1 = colptr(j1); + const int nsrow = colptr(j1 + 1) - i1; + // "total" number of rows in all the off-diagonal supernodes + const int nsrow2 = nsrow - nscol; + + // create a view of the s-th supernocal column of U + // NOTE: we currently supports only default_layout = LayoutLeft + scalar_t *dataU = const_cast(values.data()); + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); + + // extract part of solution, corresponding to the diagonal block U(s, s) + auto Xj = Kokkos::subview(X, range_type(j1, j2)); + + // workspaces + int workoffset = work_offset(s); + + /* TRSM with diagonal block */ + if (diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + if (invert_offdiagonal) { + // extract diagonal + off-diagonal blocks of U + auto Y = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + nsrow)); // needed with gemv for update&scatter + auto Uij = + Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Y); + team.team_barrier(); + // copy the diagonal back to output + for (int ii = team_rank; ii < nscol; ii += team_size) { + Xj(ii) = Y(ii); + } + } else { + // extract diagonal block of U (stored on top) + auto Ujj = + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + auto Y = Kokkos::subview( + work, + range_type(workoffset, + workoffset + + nscol)); // needed for gemv instead of trmv/trsv + for (int ii = team_rank; ii < nscol; ii += team_size) { + Y(ii) = Xj(ii); + } + team.team_barrier(); + KokkosBlas::TeamGemv< + member_type, KokkosBatched::Trans::NoTranspose, + KokkosBlas::Algo::Gemv::Unblocked>::invoke(team, one, Ujj, Y, + zero, Xj); } else { + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Lower, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ljj, + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); } } team.team_barrier(); + } + if (nsrow2 > 0) { + /* GEMM to update off diagonal blocks, Z = Uij * Xj */ + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, 
workoffset + nsrow)); + if (!invert_offdiagonal && diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Z); + team.team_barrier(); + } - /* GEMM to update with off diagonal blocks */ - auto Lij = - Kokkos::subview(viewL, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Lij, Xj, - zero, - Z); + /* scatter vector into Z */ + int i2 = i1 + nscol; // offset into rowind + Kokkos::View> + Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); + } team.team_barrier(); } } + }; +#endif - /* scatter vectors back into X */ - int i2 = i1 + nscol; // offset into rowind - int nsrow2 = - nsrow - - nscol; // "total" number of rows in all the off-diagonal supernodes - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } - team.team_barrier(); - } -}; - -// ----------------------------------------------------------- -// Functor for Upper-triangular solve in CSR -template -struct UpperTriSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; - - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - // NOTE: we currently supports only default_layout = LayoutLeft - using SupernodeView = - typename Kokkos::View; - - using range_type = Kokkos::pair; - - bool invert_diagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - UpperTriSupernodalFunctor( // supernode info - bool invert_diagonal_, const int *supercols_, - // U in CSR - const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - int level_, integer_view_t &kernel_type_, - integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - LHSType &X_, - // workspace - work_view_t &work_, integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, long node_count_) - : invert_diagonal(invert_diagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = 
nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - int j1 = supercols[s]; - int j2 = supercols[s + 1]; - int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - int i1 = colptr(j1); - int nsrow = colptr(j1 + 1) - i1; - - // create a view of the s-th supernocal row of U - scalar_t *dataU = const_cast(values.data()); - SupernodeView viewU(&dataU[i1], nsrow, nscol); - - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); - using Xj_type = decltype(Xj); - - // workspaces - int workoffset = work_offset(s); - - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; - /* gather vector into Z */ - int i2 = i1 + nscol; // offset into rowind - auto Z = Kokkos::subview( - work, - range_type(workoffset + nscol, - workoffset + nsrow)); // needed with gemv for update&scatter - using Z_type = decltype(Z); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Z(ii) = X(i); - } - team.team_barrier(); - /* GEMM to update with off diagonal blocks, Xj = -Uij^T * Z */ - if (diag_kernel_type(level) != 3) { - // not device-level GEMV-udpate - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - using Uij_type = decltype(Uij); - KokkosBlas::TeamGemv:: - template invoke( - team, -one, Uij, Z, one, Xj); - team.team_barrier(); - - /* TRSM with diagonal block */ - // extract diagonal and off-diagonal blocks of U - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - using Ujj_type = decltype(Ujj); - - if (invert_diagonal) { - // workspace - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - using Y_type = decltype(Y); - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); + template + struct UpperTriLvlSchedRPSolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + // Assuming indices are sorted per row, diag entry is final index in the + // list + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); + } else { + lhs(rowid) = rhs_rowid / val; } - team.team_barrier(); - - // caling team-level kernel in KokkosBatched on a small-size diagonal - KokkosBlas::TeamGemv:: - template invoke( - team, one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, - KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); - } - 
team.team_barrier(); + } // end for ptr } - } -}; - -// ----------------------------------------------------------- -// Functor for Upper-triangular solve in CSC -template -struct UpperTriTranSupernodalFunctor { - using execution_space = typename TriSolveHandle::HandleExecSpace; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - using scalar_t = typename ValuesType::non_const_value_type; - - using integer_view_t = Kokkos::View; - using work_view_t = - typename Kokkos::View>; - - using range_type = Kokkos::pair; - - const bool invert_diagonal; - const bool invert_offdiagonal; - const int *supercols; - ColptrType colptr; - RowindType rowind; - ValuesType values; - - int level; - integer_view_t kernel_type; - integer_view_t diag_kernel_type; - - LHSType X; - - work_view_t work; // needed with gemv for update&scatter - integer_view_t work_offset; - - NGBLType nodes_grouped_by_level; - - long node_count; - - // constructor - UpperTriTranSupernodalFunctor( // supernode info - const bool invert_diagonal_, const bool invert_offdiagonal_, - const int *supercols_, - - // U in CSC - const ColptrType &colptr_, const RowindType &rowind_, - const ValuesType &values_, - // options to pick kernel type - const int level_, const integer_view_t &kernel_type_, - const integer_view_t &diag_kernel_type_, - // right-hand-side (input), solution (output) - const LHSType &X_, - // workspace - const work_view_t &work_, const integer_view_t &work_offset_, - // - const NGBLType &nodes_grouped_by_level_, const long node_count_) - : invert_diagonal(invert_diagonal_), - invert_offdiagonal(invert_offdiagonal_), - supercols(supercols_), - colptr(colptr_), - rowind(rowind_), - values(values_), - level(level_), - kernel_type(kernel_type_), - diag_kernel_type(diag_kernel_type_), - X(X_), - work(work_), - work_offset(work_offset_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_) {} - - // operator - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - /* ---------------------------------------------------------------------- */ - /* get inputs */ - /* ---------------------------------------------------------------------- */ - const int league_rank = team.league_rank(); // batch id - const int team_size = team.team_size(); - const int team_rank = team.team_rank(); - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto s = nodes_grouped_by_level(node_count + league_rank); - - // number of columns in the s-th supernode column - const int j1 = supercols[s]; - const int j2 = supercols[s + 1]; - const int nscol = j2 - j1; - // "total" number of rows in all the supernodes (diagonal+off-diagonal) - const int i1 = colptr(j1); - const int nsrow = colptr(j1 + 1) - i1; - // "total" number of rows in all the off-diagonal supernodes - const int nsrow2 = nsrow - nscol; - - // create a view of the s-th supernocal column of U - // NOTE: we currently supports only default_layout = LayoutLeft - scalar_t *dataU = const_cast(values.data()); - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); - - // extract part of solution, corresponding to the diagonal block U(s, s) - auto Xj = Kokkos::subview(X, range_type(j1, j2)); - - // workspaces - int workoffset = work_offset(s); - - /* TRSM with diagonal block */ - if (diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - if (invert_offdiagonal) { - // extract diagonal + off-diagonal blocks of U - auto Y = 
Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nsrow)); // needed with gemv for update&scatter - auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Y); - team.team_barrier(); - // copy the diagonal back to output - for (int ii = team_rank; ii < nscol; ii += team_size) { - Xj(ii) = Y(ii); - } - } else { - // extract diagonal block of U (stored on top) - auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + nscol)); // needed for gemv instead of trmv/trsv - for (int ii = team_rank; ii < nscol; ii += team_size) { - Y(ii) = Xj(ii); - } - team.team_barrier(); - KokkosBlas::TeamGemv::invoke(team, - one, - Ujj, - Y, - zero, - Xj); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const lno_t i) const { + auto rowid = nodes_grouped_by_level(i); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + auto diag = -1; + for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + rhs_rowid = rhs_rowid - val * lhs(colid); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBatched::TeamTrsm< - member_type, KokkosBatched::Side::Left, - KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, - Xjj); + diag = ptr; } - } - team.team_barrier(); + } // end for ptr + lhs(rowid) = rhs_rowid / values(diag); } - if (nsrow2 > 0) { - /* GEMM to update off diagonal blocks, Z = Uij * Xj */ - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - if (!invert_offdiagonal && diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, Xj, - zero, - Z); - team.team_barrier(); - } + }; + + template + struct UpperTriLvlSchedTP1SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + 
diff); - /* scatter vector into Z */ - int i2 = i1 + nscol; // offset into rowind - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } team.team_barrier(); - } - } -}; -#endif - -template -struct UpperTriLvlSchedRPSolverFunctor { - typedef typename EntriesType::non_const_value_type lno_t; - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + // ASSUMPTION: sorted diagonal value located at start offset + lhs(rowid) = (rhs_rowid + diff) / values(soffset); } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } -}; - -template -struct UpperTriLvlSchedTP1SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - 
scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } -}; - -// FIXME CUDA: This algorithm not working with all integral type combos -// In any case, this serves as a skeleton for 3-level hierarchical parallelism -// for alg dev -template -struct UpperTriLvlSchedTP2SolverFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } 
// end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - size_t nrows = row_map.extent(0) - 1; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = - nodes_grouped_by_level(node_count + my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + auto diag = -1; - team.team_barrier(); - } -}; - -// -------------------------------- -// Single-block functors -// -------------------------------- - -template -struct LowerTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
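// A minimal serial sketch (not the library code) of the per-supernode step
// that the supernodal functors above perform with team-level TRSM/GEMV calls,
// for the CSC upper-triangular case without inverted blocks: solve the dense
// diagonal block of the panel, form Z = Uij * Xj with the off-diagonal block,
// and scatter-subtract Z into X through the panel's row indices (done through
// an atomic view in the team code). The panel is assumed column-major
// (LayoutLeft) with leading dimension nsrow, and `rowind` here is assumed to
// point directly at the off-diagonal row indices of this supernode (the
// functors offset into the global rowind by i1 + nscol).
void supernode_step_sketch(int nscol, int nsrow,
                           const double *U,    // nsrow x nscol panel
                           const int *rowind,  // rows of the off-diagonal block
                           double *X, int j1) {
  double *Xj = X + j1;  // slice of X covered by the diagonal block
  // backward substitution with the dense upper-triangular nscol x nscol block
  for (int j = nscol - 1; j >= 0; --j) {
    Xj[j] /= U[j + j * nsrow];
    for (int i = 0; i < j; ++i) Xj[i] -= U[i + j * nsrow] * Xj[j];
  }
  // Z = Uij * Xj, scattered into X with a subtraction
  for (int ii = 0; ii < nsrow - nscol; ++ii) {
    double z = 0.0;
    for (int j = 0; j < nscol; ++j) z += U[(nscol + ii) + j * nsrow] * Xj[j];
    X[rowind[ii]] -= z;
  }
}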
- - LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + team.team_barrier(); -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; + // At end, finalize rowid == colid + // only one thread should do this, also can use Kokkos::single + if (my_rank == 0) { + lhs(rowid) = (rhs_rowid + diff) / values(diag); } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val 
= rhs(rowid); + } + }; + + // FIXME CUDA: This algorithm not working with all integral type combos + // In any case, this serves as a skeleton for 3-level hierarchical parallelism + // for alg dev + template + struct UpperTriLvlSchedTP2SolverFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long node_groups; + + UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + long node_count_, long node_groups_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + node_count(node_count_), + node_groups(node_groups_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } + }, + diff); + + // ASSUMPTION: sorted diagonal value located at start offset + lhs(rowid) = (rhs_rowid + diff) / values(soffset); + } // end if + }); // end TeamThreadRange -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; + team.team_barrier(); + } - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { + auto rowid = nodes_grouped_by_level(node_count + + my_league * node_groups + ng); + if (size_t(rowid) < nrows) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid + 1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, soffset, eoffset), + [&](const long ptr, scalar_t &tdiff) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff = tdiff - val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); + + lhs(rowid) = (rhs_rowid + diff) / values(diag); + } // end if + }); // end TeamThreadRange - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each 
iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + } + }; + + // -------------------------------- + // Single-block functors + // -------------------------------- + + template + struct LowerTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + LowerTriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); eoffset = row_map(rowid + 1); rhs_val = rhs(rowid); @@ -1797,38 +1602,31 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - 
KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -1851,7 +1649,8 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + auto ptr = soffset + loffset; + auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -1864,194 +1663,189 @@ struct LowerTriLvlSchedTP1SingleBlockFunctor { #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct UpperTriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
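// A self-contained toy version (not the library code, illustrative data only)
// of the team-per-row pattern the TP1 functors in this file use: one
// Kokkos::TeamPolicy launch per level set, one team per row in the level, a
// TeamThreadRange reduction over the row's off-diagonal entries, and a single
// thread applying the diagonal. The 3x3 lower-triangular matrix, right-hand
// side, and hard-coded level sets below are made up; the diagonal is assumed
// to be the last entry of each sorted row, matching the sorted code paths.
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using team_policy = Kokkos::TeamPolicy<>;
    using member_type = team_policy::member_type;

    // L = [2 0 0; 0 3 0; 1 2 4], rhs = [2, 6, 13]  ->  x = [1, 2, 2]
    const int n = 3, nnz = 5;
    Kokkos::View<int *> row_map("row_map", n + 1), entries("entries", nnz);
    Kokkos::View<int *> nodes("nodes_grouped_by_level", n);
    Kokkos::View<double *> values("values", nnz), rhs("rhs", n), lhs("lhs", n);

    auto h_row_map = Kokkos::create_mirror_view(row_map);
    auto h_entries = Kokkos::create_mirror_view(entries);
    auto h_values  = Kokkos::create_mirror_view(values);
    auto h_rhs     = Kokkos::create_mirror_view(rhs);
    auto h_nodes   = Kokkos::create_mirror_view(nodes);
    const int    rm[] = {0, 1, 2, 5};
    const int    en[] = {0, 1, 0, 1, 2};
    const double va[] = {2.0, 3.0, 1.0, 2.0, 4.0};
    const double rh[] = {2.0, 6.0, 13.0};
    for (int i = 0; i <= n; ++i) h_row_map(i) = rm[i];
    for (int i = 0; i < nnz; ++i) { h_entries(i) = en[i]; h_values(i) = va[i]; }
    for (int i = 0; i < n; ++i) { h_rhs(i) = rh[i]; h_nodes(i) = i; }
    Kokkos::deep_copy(row_map, h_row_map);
    Kokkos::deep_copy(entries, h_entries);
    Kokkos::deep_copy(values, h_values);
    Kokkos::deep_copy(rhs, h_rhs);
    Kokkos::deep_copy(nodes, h_nodes);

    // level 0 = rows {0, 1} (diagonal only), level 1 = row {2}
    const int level_offset[] = {0, 2, 3};
    for (int lvl = 0; lvl < 2; ++lvl) {
      const int node_count = level_offset[lvl];
      const int lvl_nodes  = level_offset[lvl + 1] - level_offset[lvl];
      Kokkos::parallel_for(
          "toy_tp1_level", team_policy(lvl_nodes, Kokkos::AUTO),
          KOKKOS_LAMBDA(const member_type &team) {
            const int rowid   = nodes(node_count + team.league_rank());
            const int soffset = row_map(rowid);
            const int eoffset = row_map(rowid + 1);
            double diff = 0.0;
            Kokkos::parallel_reduce(
                Kokkos::TeamThreadRange(team, soffset, eoffset),
                [&](const int ptr, double &tdiff) {
                  const int colid = entries(ptr);
                  if (colid != rowid) tdiff -= values(ptr) * lhs(colid);
                },
                diff);
            team.team_barrier();
            Kokkos::single(Kokkos::PerTeam(team), [&]() {
              // diagonal is the last entry of a sorted lower-triangular row
              lhs(rowid) = (rhs(rowid) + diff) / values(eoffset - 1);
            });
          });
      Kokkos::fence();  // finish this level before the next one starts
    }

    auto h_lhs = Kokkos::create_mirror_view(lhs);
    Kokkos::deep_copy(h_lhs, lhs);
    for (int i = 0; i < n; ++i) printf("x(%d) = %g\n", i, h_lhs(i));
  }
  Kokkos::finalize();
  return 0;
}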
- - UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
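// When cutoff exceeds the team size, the loop below strides each thread
// through several level-local row slots. A tiny stand-alone illustration of
// that mapping (team_size, cutoff, and nodes_this_lvl are made-up numbers):
#include <cstdio>
int main() {
  const int team_size = 4, cutoff = 10, nodes_this_lvl = 7;
  for (int my_team_rank = 0; my_team_rank < team_size; ++my_team_rank)
    for (int my_rank = my_team_rank; my_rank < cutoff; my_rank += team_size)
      if (my_rank < nodes_this_lvl)
        printf("team rank %d handles level-local node %d\n", my_team_rank,
               my_rank);
  return 0;
}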
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; } -#else - auto trange = eoffset - soffset; - auto diag = -1; + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto 
nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct UpperTriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... 
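// Serial model (illustrative data, not the library code) of the row update
// the upper-triangular single-block functor below performs: rows are
// processed from the last level back to the first, and within a sorted
// upper-triangular CSR row the diagonal is the first entry (soffset).
#include <cstdio>
#include <vector>
int main() {
  // 3x3 upper-triangular CSR:
  //   [ 2 1 0   ]
  //   [ 0 3 0.5 ]   rhs = [ 4, 7, 8 ]  ->  x = [ 1, 2, 2 ]
  //   [ 0 0 4   ]
  const std::vector<int> row_map = {0, 2, 4, 5};
  const std::vector<int> entries = {0, 1, 1, 2, 2};
  const std::vector<double> values = {2.0, 1.0, 3.0, 0.5, 4.0};
  const std::vector<double> rhs = {4.0, 7.0, 8.0};
  std::vector<double> lhs(3, 0.0);
  for (int rowid = 2; rowid >= 0; --rowid) {
    const int soffset = row_map[rowid], eoffset = row_map[rowid + 1];
    double diff = 0.0;
    for (int ptr = soffset + 1; ptr < eoffset; ++ptr)  // skip diagonal at soffset
      diff -= values[ptr] * lhs[entries[ptr]];
    lhs[rowid] = (rhs[rowid] + diff) / values[soffset];
  }
  for (int i = 0; i < 3; ++i) printf("x(%d) = %g\n", i, lhs[i]);
  return 0;
}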
+ + UpperTriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2081,38 +1875,32 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { }, diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri + // ASSUMPTION: sorted diagonal value located at soffset lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
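// The Unsorted* code paths cannot assume where the diagonal sits within a
// row, so the same sweep that accumulates the off-diagonal contributions also
// records the diagonal's position. A minimal serial model of that pattern,
// with made-up data for a single row:
#include <cstdio>
int main() {
  const int rowid = 1;
  const int entries[] = {2, 1, 0};          // unsorted column indices of row 1
  const double values[] = {0.5, 3.0, 1.0};  // A(1,2), A(1,1), A(1,0)
  const double lhs[] = {1.0, 0.0, 2.0};     // x(0) and x(2) already solved
  const double rhs_rowid = 8.0;
  double diff = 0.0;
  int diag = -1;
  for (int ptr = 0; ptr < 3; ++ptr) {
    if (entries[ptr] != rowid)
      diff -= values[ptr] * lhs[entries[ptr]];
    else
      diag = ptr;  // remember where the diagonal value lives
  }
  printf("x(%d) = %g\n", rowid, (rhs_rowid + diff) / values[diag]);  // x(1) = 2
  return 0;
}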
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2134,6 +1922,7 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { #else auto trange = eoffset - soffset; auto diag = -1; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, trange), [&](const int loffset, scalar_t &tdiff) { @@ -2149,204 +1938,199 @@ struct UpperTriLvlSchedTP1SingleBlockFunctor { diff); #endif lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctor { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
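// TriLvlSchedTP1SingleBlockFunctor handles both triangles and selects the
// diagonal from the triangle type: last entry of a sorted row for the lower
// triangle, first entry for the upper triangle. A small serial helper
// (hypothetical, not part of the library API) expressing one row update
// under that convention:
inline void solve_row_sketch(const int *row_map, const int *entries,
                             const double *values, const double *rhs,
                             double *lhs, int rowid, bool is_lowertri) {
  const int soffset = row_map[rowid];
  const int eoffset = row_map[rowid + 1];
  double diff = 0.0;
  for (int ptr = soffset; ptr < eoffset; ++ptr)
    if (entries[ptr] != rowid) diff -= values[ptr] * lhs[entries[ptr]];
  const double diag = is_lowertri ? values[eoffset - 1] : values[soffset];
  lhs[rowid] = (rhs[rowid] + diff) / diag;
}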
- - TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl each thread + mut_node_count += nodes_this_lvl; } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } - }, - diff); + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); } else { diag = ptr; } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctor { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... 
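// Serial sketch of what the unsorted-row path computes for one CSR row: accumulate
// diff = -sum(val * lhs(col)) over off-diagonal entries while remembering where the
// diagonal sits, since it can appear anywhere in the row. The toy row is hypothetical.
#include <cstdio>
#include <vector>

int main() {
  // One CSR row of a 4x4 matrix, row id 2, with unsorted column indices.
  const int rowid = 2;
  std::vector<int>    entries = {3, 2, 0};        // column ids (diagonal not first or last)
  std::vector<double> values  = {1.0, 4.0, 2.0};  // matching values
  std::vector<double> lhs     = {1.0, 0.0, 0.0, 5.0};
  const double rhs_val = 8.0;

  double diff = 0.0;
  int diag = -1;  // index of the diagonal entry within this row
  for (std::size_t ptr = 0; ptr < entries.size(); ++ptr) {
    if (entries[ptr] != rowid) {
      diff -= values[ptr] * lhs[entries[ptr]];
    } else {
      diag = static_cast<int>(ptr);
    }
  }
  lhs[rowid] = (rhs_val + diff) / values[diag];
  std::printf("lhs(%d) = %f\n", rowid, lhs[rowid]);  // (8 - 1*5 - 2*1) / 4 = 0.25
  return 0;
}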
+ + TriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2384,34 +2168,29 @@ struct TriLvlSchedTP1SingleBlockFunctor { else lhs(rowid) = (rhs_val + diff) / values(soffset); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
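// Sketch of the sorted-row assumption used by the untagged operator above: with sorted
// column indices the diagonal of a lower-triangular row is its last entry (eoffset - 1)
// and of an upper-triangular row its first entry (soffset). Row data is hypothetical.
#include <cstdio>
#include <vector>

int main() {
  // Row 1 of a lower-triangular matrix in CSR: columns {0, 1}, sorted.
  std::vector<int>    entries = {0, 1};
  std::vector<double> values  = {3.0, 2.0};
  std::vector<double> lhs     = {4.0, 0.0};
  const double rhs_val = 14.0;
  const int soffset = 0, eoffset = 2;
  const bool is_lowertri = true;

  double diff = 0.0;
  for (int ptr = soffset; ptr < eoffset; ++ptr) {
    if (entries[ptr] != 1) diff -= values[ptr] * lhs[entries[ptr]];
  }

  const double diag_val = is_lowertri ? values[eoffset - 1]  // last entry of the row
                                      : values[soffset];     // first entry of the row
  lhs[1] = (rhs_val + diff) / diag_val;
  std::printf("lhs(1) = %f\n", lhs[1]);  // (14 - 3*4) / 2 = 1
  return 0;
}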
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2449,143 +2228,205 @@ struct TriLvlSchedTP1SingleBlockFunctor { #endif lhs(rowid) = (rhs_val + diff) / values(diag); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -template -struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - typedef typename RowMapType::execution_space execution_space; - typedef Kokkos::TeamPolicy policy_type; - typedef typename policy_type::member_type member_type; - typedef typename EntriesType::non_const_value_type lno_t; - typedef typename ValuesType::non_const_value_type scalar_t; - - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - NGBLType nodes_grouped_by_level; - NGBLType nodes_per_level; - ValuesType diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
- - TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const NGBLType &nodes_grouped_by_level_, const NGBLType &nodes_per_level_, - const ValuesType &diagonal_values_, long node_count_, - const long lvl_start_, const long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
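// Hypothetical sketch (names and sizes assumed, not taken from this patch) of how a
// tagged single-block functor like the ones above is dispatched: a TeamPolicy with
// league size 1 and a work tag that selects the strided-cutoff operator().
#include <Kokkos_Core.hpp>
#include <cstdio>

struct LargerCutoffTag {};

struct SingleBlockSketch {
  using team_policy = Kokkos::TeamPolicy<LargerCutoffTag>;
  using member_type = team_policy::member_type;

  int cutoff;  // assumed maximum number of work items per level

  KOKKOS_INLINE_FUNCTION
  void operator()(const LargerCutoffTag &, const member_type &team) const {
    // Each thread strides through the ranks below cutoff, as in the functors above.
    for (int my_rank = team.team_rank(); my_rank < cutoff;
         my_rank += team.team_size()) {
      printf("team rank %d owns work item %d\n", team.team_rank(), my_rank);
    }
  }
};

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    SingleBlockSketch functor{10};  // assumed cutoff of 10
    Kokkos::parallel_for("single_block_sketch",
                         SingleBlockSketch::team_policy(1, Kokkos::AUTO),
                         functor);
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}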
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); #ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for + // lower tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); + else + lhs(rowid) = (rhs_val + diff) / values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
+ for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); if (colid != rowid) { - tdiff -= val * lhs(colid); + diff -= val * lhs(colid); + } else { + diag = ptr; } - }, - diff); + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } else { + diag = ptr; + } + }, + diff); #endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename NGBLType::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); + lhs(rowid) = (rhs_val + diff) / values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; + + template + struct TriLvlSchedTP1SingleBlockFunctorDiagValues { + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + entries_t nodes_grouped_by_level; + entries_t nodes_per_level; + ValuesType diagonal_values; + + long node_count; // like "block" offset into ngbl, my_league is the "local" + // offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... 
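// Sketch of the DiagValues variant declared above: instead of reading the diagonal out
// of the CSR values array, each row is finished with a separately stored diagonal
// (which the caller may already have inverted). Toy data is hypothetical.
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> lhs             = {4.0, 0.0};
  std::vector<double> diagonal_values = {3.0, 2.0};  // one entry per row
  const int    rowid   = 1;
  const double rhs_val = 14.0;
  const double diff    = -12.0;  // accumulated -sum(val * lhs(col)) over off-diagonals

  // Same update as the other single-block functors, but the divisor comes from
  // diagonal_values(rowid) rather than values(eoffset - 1) or values(soffset).
  lhs[rowid] = (rhs_val + diff) / diagonal_values[rowid];
  std::printf("lhs(%d) = %f\n", rowid, lhs[rowid]);  // (14 - 12) / 2 = 1
  return 0;
}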
+ + TriLvlSchedTP1SingleBlockFunctorDiagValues( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, + long node_count_, const long lvl_start_, const long lvl_end_, + const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) + : row_map(row_map_), + entries(entries_), + values(values_), + lhs(lhs_), + rhs(rhs_), + nodes_grouped_by_level(nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), + diagonal_values(diagonal_values_), + node_count(node_count_), + lvl_start(lvl_start_), + lvl_end(lvl_end_), + is_lowertri(is_lower_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { // THIS is where the mapping of threadid to rowid happens rowid = nodes_grouped_by_level(my_rank + mut_node_count); @@ -2609,366 +2450,301 @@ struct TriLvlSchedTP1SingleBlockFunctorDiagValues { auto ptr = soffset + loffset; auto colid = entries(ptr); auto val = values(ptr); + if (colid != rowid) { tdiff -= val * lhs(colid); } }, diff); #endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower + // tri, soffset for upper tri lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl - // per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator -}; - -#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT -template -struct ReturnTeamPolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } - - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ReturnTeamPolicyType { - using PolicyType = Kokkos::TeamPolicy; - - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()(const 
LargerCutoffTag &, const member_type &team) const { + long mut_node_count = node_count; + typename entries_t::non_const_value_type rowid{0}; + typename RowMapType::non_const_value_type soffset{0}; + typename RowMapType::non_const_value_type eoffset{0}; + typename RHSType::non_const_value_type rhs_val{0}; + scalar_t diff = scalar_t(0.0); + + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; + my_rank += team.team_size()) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid + 1); + rhs_val = rhs(rowid); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + diff -= val * lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, trange), + [&](const int loffset, scalar_t &tdiff) { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if (colid != rowid) { + tdiff -= val * lhs(colid); + } + }, + diff); #endif + lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of + // lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + }; -template -struct ReturnRangePolicyType; - -#ifdef KOKKOS_ENABLE_SERIAL -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + template + static void lower_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + thandle.get_sptrsvCudaGraph(); + + auto nlevels = thandle.get_num_levels(); + + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; + + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); + // Kokkos::fence(); + + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + size_type node_count = 0; + + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; + + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + auto policy = std::is_same::value + ? 
team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_OPENMP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + Kokkos::parallel_for( + "parfor_l_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + LowerTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + node_count += hnodes_per_level(iter); + } + } + cudaStreamEndCapture(stream1, &graph); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType) { - return PolicyType(nt, ts); - // return PolicyType(ExecInstanceType(),nt,ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); + thandle.cudagraphCreated = true; + } + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end lower_tri_solve_cg - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -struct ReturnRangePolicyType { - using PolicyType = Kokkos::RangePolicy; + template + static void upper_tri_solve_cg(TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { + typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = + thandle.get_sptrsvCudaGraph(); - static inline PolicyType get_policy(int nt, int ts) { - return PolicyType(nt, ts); - } + auto nlevels = thandle.get_num_levels(); - template - static inline PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { - return PolicyType(stream, nt, ts); - } -}; -#endif + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; -template -void lower_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), + EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); - auto nlevels = thandle.get_num_levels(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; + size_type node_count = 0; - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - // Kokkos::fence(); + 
int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + // Start capturing stream + if (thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); - size_type node_count = 0; + auto policy = std::is_same::value + ? team_policy(lvl_nodes, team_size, cuda1) + : team_policy(lvl_nodes, team_size); - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; + Kokkos::parallel_for( + "parfor_u_team_cudagraph", + Kokkos::Experimental::require( + policy, + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedTP1SolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count)); - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - using policy_type = ReturnTeamPolicyType; - - Kokkos::parallel_for( - "parfor_l_team_cudagraph", - Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); - - node_count += hnodes_per_level(iter); + node_count += hnodes_per_level(iter); + } } - } - cudaStreamEndCapture(stream1, &graph); + cudaStreamEndCapture(stream1, &graph); - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end lower_tri_solve_cg - -template -void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); - - auto nlevels = thandle.get_num_levels(); - - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; - - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - size_type node_count = 0; - - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 
64 : team_size; - - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - using policy_type = ReturnTeamPolicyType; - - Kokkos::parallel_for( - "parfor_u_team_cudagraph", - Kokkos::Experimental::require( - ReturnTeamPolicyType::get_policy( - lvl_nodes, team_size, cuda1), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); - - node_count += hnodes_per_level(iter); - } + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, + NULL, 0); + thandle.cudagraphCreated = true; } - cudaStreamEndCapture(stream1, &graph); - - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, - 0); - thandle.cudagraphCreated = true; - } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); - cudaStreamSynchronize(stream1); - Kokkos::fence(); -} // end upper_tri_solve_cg + cudaStreamSynchronize(stream1); + Kokkos::fence(); + } // end upper_tri_solve_cg #endif -template -void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; - - row_map_host_view_t row_map_host; - - const scalar_t zero(0.0); - const scalar_t one(1.0); - - auto nodes_grouped_by_level_host = 
thandle.get_host_nodes_grouped_by_level(); - - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - - row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + using namespace KokkosSparse::Experimental; + using device_t = Kokkos::Device; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using row_map_host_view_t = Kokkos::View; + + row_map_host_view_t row_map_host; + + const scalar_t zero(0.0); + const scalar_t one(1.0); + + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); + + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - // inversion options - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - const bool unit_diagonal = thandle.is_unit_diagonal(); + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } + + // inversion options + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + const bool unit_diagonal = thandle.is_unit_diagonal(); - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = + thandle.get_diag_kernel_type_host(); - // workspaces - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspaces + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - { + for (size_type lvl = 0; lvl < nlevels; ++lvl) { size_type lvl_nodes = hnodes_per_level(lvl); if (lvl_nodes != 0) { @@ -2980,27 +2756,24 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( - 
Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), + range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( + ValuesType, LHSType, RHSType>( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); #else LowerTriLvlSchedTP1SolverFunctor + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); #endif @@ -3008,14 +2781,14 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); else Kokkos::parallel_for( "parfor_l_team", Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), + team_policy(space, lvl_nodes, team_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } @@ -3051,10 +2824,10 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count, vector_size, 0); #else LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, + LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size @@ -3072,7 +2845,6 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // NOTE: we currently supports only default_layout = LayoutLeft - using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3084,13 +2856,13 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_diagonal && !invert_offdiagonal) { // copy diagonals to workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3175,21 +2947,21 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, if (invert_offdiagonal) { // copy diagonals from workspaces const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, 
lhs, work); Kokkos::parallel_for( "parfor_tri_supernode_spmv", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } } // launching sparse-triangular solve functor - LowerTriSupernodalFunctor + LowerTriSupernodalFunctor sptrsv_functor(unit_diagonal, invert_diagonal, invert_offdiagonal, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, @@ -3197,7 +2969,7 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_functor); @@ -3219,7 +2991,6 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3231,25 +3002,25 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, auto digmat = thandle.get_diagblock(lvl); KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_init_functor); } @@ -3259,13 +3030,13 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); Kokkos::parallel_for( "parfor_lsolve_supernode", Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), + team_policy(space, lvl_nodes, Kokkos::AUTO), Kokkos::Experimental::WorkItemProperty::HintLightWeight), sptrsv_finalize_functor); @@ -3284,165 +3055,159 @@ void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, cudaProfilerStop(); #endif } // end if - } // scope for if-block - - } // end for lvl + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + Execution space 
: " << execution_space::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; - std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl - << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl + << std::endl; #endif -} // end lower_tri_solve + } // end lower_tri_solve -template -void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + template + static void upper_tri_solve(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - using memory_space = typename TriSolveHandle::HandleTempMemorySpace; - using device_t = Kokkos::Device; - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); - - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + using device_t = Kokkos::Device; + + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + // auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); + // Kokkos::deep_copy(hnodes_per_level, nodes_per_level); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using namespace KokkosSparse::Experimental; - using integer_view_t = typename TriSolveHandle::integer_view_t; - using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; - using scalar_t = typename ValuesType::non_const_value_type; - using range_type = Kokkos::pair; - using row_map_host_view_t = Kokkos::View; + using namespace KokkosSparse::Experimental; + using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; + using row_map_host_view_t = Kokkos::View; - row_map_host_view_t row_map_host; + row_map_host_view_t row_map_host; - const scalar_t zero(0.0); - const scalar_t one(1.0); + const scalar_t zero(0.0); + const scalar_t one(1.0); - auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); + auto nodes_grouped_by_level_host = + thandle.get_host_nodes_grouped_by_level(); - if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == 
SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { - Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); + if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + Kokkos::deep_copy(nodes_grouped_by_level_host, nodes_grouped_by_level); - row_map_host = row_map_host_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), - row_map.extent(0)); - Kokkos::deep_copy(row_map_host, row_map); - } + row_map_host = row_map_host_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "host rowmap"), + row_map.extent(0)); + Kokkos::deep_copy(row_map_host, row_map); + } - // supernode sizes - const int *supercols = thandle.get_supercols(); - const int *supercols_host = thandle.get_supercols_host(); + // supernode sizes + const int *supercols = thandle.get_supercols(); + const int *supercols_host = thandle.get_supercols_host(); - // inversion option - const bool invert_diagonal = thandle.get_invert_diagonal(); - const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); + // inversion option + const bool invert_diagonal = thandle.get_invert_diagonal(); + const bool invert_offdiagonal = thandle.get_invert_offdiagonal(); - // kernel types - integer_view_t kernel_type = thandle.get_kernel_type(); - integer_view_t diag_kernel_type = thandle.get_diag_kernel_type(); + // kernel types + work_view_int_t kernel_type = thandle.get_kernel_type(); + work_view_int_t diag_kernel_type = thandle.get_diag_kernel_type(); - integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); - integer_view_host_t diag_kernel_type_host = - thandle.get_diag_kernel_type_host(); + integer_view_host_t kernel_type_host = thandle.get_kernel_type_host(); + integer_view_host_t diag_kernel_type_host = + thandle.get_diag_kernel_type_host(); - // workspace - integer_view_t work_offset = thandle.get_work_offset(); - integer_view_host_t work_offset_host = thandle.get_work_offset_host(); - auto work = thandle.get_workspace(); + // workspace + work_view_int_t work_offset = thandle.get_work_offset(); + integer_view_host_t work_offset_host = thandle.get_work_offset_host(); + auto work = thandle.get_workspace(); #endif - size_type node_count = 0; + size_type node_count = 0; -// This must stay serial; would be nice to try out Cuda's graph stuff to reduce -// kernel launch overhead + // This must stay serial; would be nice to try out Cuda's graph stuff to + // reduce kernel launch overhead #ifdef profile_supernodal_etree - Kokkos::Timer sptrsv_timer; - sptrsv_timer.reset(); + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); #endif - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - size_type lvl_nodes = hnodes_per_level(lvl); + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + size_type lvl_nodes = hnodes_per_level(lvl); - if (lvl_nodes != 0) { + if (lvl_nodes != 0) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStart(); + cudaProfilerStart(); #endif - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::Experimental::require( - Kokkos::RangePolicy(space, node_count, - node_count + lvl_nodes), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); - } else if 
(thandle.get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - using team_policy_t = Kokkos::TeamPolicy; - - int team_size = thandle.get_team_size(); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::Experimental::require( + range_policy(space, node_count, node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + UpperTriLvlSchedRPSolverFunctor( + row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - false, node_count); + TriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + false, node_count); #else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + UpperTriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - // TP2 algorithm has issues with some offset-ordinal combo to be addressed - /* - else if ( thandle.get_algorithm() == -KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef -Kokkos::TeamPolicy tvt_policy_type; - - int team_size = thandle.get_team_size(); - if ( team_size == -1 ) { + if (team_size == -1) + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + else + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + // TP2 algorithm has issues with some offset-ordinal combo to be + // addressed + /* + else if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { +typedef Kokkos::TeamPolicy tvt_policy_type; + + int team_size = thandle.get_team_size(); + if ( team_size == -1 ) { team_size = std::is_same< typename -Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : -64; + Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace +>::value ? 
1 : 64; } int vector_size = thandle.get_team_size(); if ( vector_size == -1 ) { @@ -3461,10 +3226,10 @@ node_group (thread has full ownership of a node) #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count, vector_size, 0); #else UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, +LHSType, RHSType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); #endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( @@ -3472,105 +3237,208 @@ nodes_grouped_by_level, node_count, node_groups); #endif tstf); } // end elseif */ #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || - thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || + thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { #ifdef profile_supernodal_etree - size_t flops = 0; - Kokkos::Timer timer; - timer.reset(); + size_t flops = 0; + Kokkos::Timer timer; + timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; - if (thandle.is_column_major()) { // U stored in CSC - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); + if (thandle.is_column_major()) { // U stored in CSC + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the + // input into workspace) + scalar_t *dataU = const_cast(values.data()); + + if (invert_diagonal && !invert_offdiagonal) { + // copy diagonals to workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -2, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); + } + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); + + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = + j2 - j1; // number of columns in the s-th supernode column + + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; // "total" number of rows in all the + // supernodes (diagonal+off-diagonal) + int nsrow2 = nsrow - nscol; // "total" number of rows in all + // the off-diagonal supernodes +#ifdef profile_supernodal_etree + flops += 2 * (nscol * nsrow); +#endif - if (invert_diagonal && !invert_offdiagonal) { - // copy diagonals to workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - for (size_type 
league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); + // workspace + int workoffset = work_offset_host(s); - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = - j2 - j1; // number of columns in the s-th supernode column + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; // "total" number of rows in all the - // supernodes (diagonal+off-diagonal) - int nsrow2 = nsrow - nscol; // "total" number of rows in all the - // off-diagonal supernodes -#ifdef profile_supernodal_etree - flops += 2 * (nscol * nsrow); -#endif + if (invert_offdiagonal) { + auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), + Kokkos::ALL()); + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + auto Z = Kokkos::subview( + work, + range_type( + workoffset, + workoffset + + nsrow)); // needed with gemv for update&scatter + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } else { + // extract part of the solution, corresponding to the diagonal + // block + auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - // workspace - int workoffset = work_offset_host(s); + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), + Kokkos::ALL()); + if (invert_diagonal) { + auto Y = Kokkos::subview( + work, range_type( + workoffset, + workoffset + nscol)); // needed for gemv + // instead of trmv/trsv + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); + } else { + // NOTE: we currently supports only default_layout = + // LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); + } + // update off-diagonal blocks + if (nsrow2 > 0) { + // extract the off-diagonal blocks of s-th supernodal column + // of U + auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), + Kokkos::ALL()); + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, + workoffset + nscol + + nsrow2)); // needed with gemv for + // update&scatter + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + } + } + } + if (invert_offdiagonal) { + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); + } + } - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); + // launching sparse-triangular solve functor + UpperTriTranSupernodalFunctor + sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, + row_map, entries, values, lvl, kernel_type, + diag_kernel_type, lhs, work, work_offset, + nodes_grouped_by_level, node_count); - if (invert_offdiagonal) { - auto Uij = - Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, 
Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); + } else { // U stored in CSR + // launching sparse-triangular solve functor + UpperTriSupernodalFunctor + sptrsv_functor(invert_diagonal, supercols, row_map, entries, + values, lvl, kernel_type, diag_kernel_type, lhs, + work, work_offset, nodes_grouped_by_level, + node_count); + + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); + + if (diag_kernel_type_host(lvl) == 3) { + // using device-level kernels (functor is called to gather the + // input into workspace) + scalar_t *dataU = const_cast(values.data()); + + for (size_type league_rank = 0; league_rank < lvl_nodes; + league_rank++) { + auto s = nodes_grouped_by_level_host(node_count + league_rank); + + // supernodal column size + int j1 = supercols_host[s]; + int j2 = supercols_host[s + 1]; + int nscol = + j2 - j1; // number of columns in the s-th supernode column + + // "total" number of rows in all the supernodes + // (diagonal+off-diagonal) + int i1 = row_map_host(j1); + int i2 = row_map_host(j1 + 1); + int nsrow = i2 - i1; + // "total" number of rows in all the off-diagonal supernodes + int nsrow2 = nsrow - nscol; + + // workspace + int workoffset = work_offset_host(s); + + // create a view for the s-th supernocal block column + // NOTE: we currently supports only default_layout = LayoutLeft + Kokkos::View + viewU(&dataU[i1], nsrow, nscol); + + // extract part of the solution, corresponding to the diagonal + // block auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Z = Kokkos::subview( + auto Y = Kokkos::subview( work, range_type( workoffset, workoffset + - nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); - } else { - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); + nscol)); // needed for gemv instead of trmv/trsv - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); - if (invert_diagonal) { - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); - } else { - // NOTE: we currently supports only default_layout = - // LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); - } - // update off-diagonal blocks + // update with off-diagonal blocks if (nsrow2 > 0) { // extract the off-diagonal blocks of s-th supernodal column - // of U + // of + // U auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); auto Z = Kokkos::subview( @@ -3579,720 +3447,607 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } - } - } - if (invert_offdiagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( 
- team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - } - - // launching sparse-triangular solve functor - UpperTriTranSupernodalFunctor - sptrsv_functor(invert_diagonal, invert_offdiagonal, supercols, - row_map, entries, values, lvl, kernel_type, - diag_kernel_type, lhs, work, work_offset, - nodes_grouped_by_level, node_count); - - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( - "parfor_usolve_tran_supernode", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); - } else { // U stored in CSR - // launching sparse-triangular solve functor - UpperTriSupernodalFunctor - sptrsv_functor(invert_diagonal, supercols, row_map, entries, - values, lvl, kernel_type, diag_kernel_type, lhs, - work, work_offset, nodes_grouped_by_level, - node_count); - - using team_policy_t = Kokkos::TeamPolicy; - Kokkos::parallel_for( - "parfor_usolve_supernode", - Kokkos::Experimental::require( - team_policy_t(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_functor); - - if (diag_kernel_type_host(lvl) == 3) { - // using device-level kernels (functor is called to gather the input - // into workspace) - scalar_t *dataU = const_cast(values.data()); - - for (size_type league_rank = 0; league_rank < lvl_nodes; - league_rank++) { - auto s = nodes_grouped_by_level_host(node_count + league_rank); - - // supernodal column size - int j1 = supercols_host[s]; - int j2 = supercols_host[s + 1]; - int nscol = - j2 - j1; // number of columns in the s-th supernode column - // "total" number of rows in all the supernodes - // (diagonal+off-diagonal) - int i1 = row_map_host(j1); - int i2 = row_map_host(j1 + 1); - int nsrow = i2 - i1; - // "total" number of rows in all the off-diagonal supernodes - int nsrow2 = nsrow - nscol; - - // workspace - int workoffset = work_offset_host(s); - - // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - viewU(&dataU[i1], nsrow, nscol); - - // extract part of the solution, corresponding to the diagonal - // block - auto Xj = Kokkos::subview(lhs, range_type(j1, j2)); - auto Y = Kokkos::subview( - work, - range_type( - workoffset, - workoffset + - nscol)); // needed for gemv instead of trmv/trsv - - // update with off-diagonal blocks - if (nsrow2 > 0) { - // extract the off-diagonal blocks of s-th supernodal column of - // U - auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), - Kokkos::ALL()); - auto Z = Kokkos::subview( - work, - range_type( - workoffset + nscol, - workoffset + nscol + - nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); + // "triangular-solve" to compute Xj + // extract the diagonal block of s-th supernocal column of U + auto Ujj = + Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); + if (invert_diagonal) { + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); + } else { + // NOTE: we currently supports only default_layout = + // LayoutLeft + Kokkos::View + Xjj(Xj.data(), nscol, 1); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + } } - - // "triangular-solve" to compute Xj - // extract the diagonal block of s-th supernocal column of U - auto Ujj = - Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - 
KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); - } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View - Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); + // copy diagonals from workspaces + const int *work_offset_data = work_offset.data(); + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, + work_offset_data, lhs, work); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty:: + HintLightWeight), + sptrsv_init_functor); } } - if (invert_diagonal) { - // copy diagonals from workspaces - const int *work_offset_data = work_offset.data(); - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, work_offset_data, lhs, work); - Kokkos::parallel_for( - "parfor_tri_supernode_spmv", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } } - } #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " flop count: " << flops - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " flop count: " << flops + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV || - thandle.get_algorithm() == - SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + } else if (thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV || + thandle.get_algorithm() == + SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { #ifdef profile_supernodal_etree - Kokkos::Timer timer; - timer.reset(); + Kokkos::Timer timer; + timer.reset(); #endif - // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + // initialize input & output vectors - // update with one, or two, spmv - bool transpose_spmv = - ((!thandle.transpose_spmv() && thandle.is_column_major()) || - (thandle.transpose_spmv() && !thandle.is_column_major())); - const char *tran = (transpose_spmv ? "T" : "N"); - if (!transpose_spmv) { // U stored in CSR - if (!invert_offdiagonal) { - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); - // copy from work to lhs corresponding to diagonal blocks - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + // update with one, or two, spmv + bool transpose_spmv = + ((!thandle.transpose_spmv() && thandle.is_column_major()) || + (thandle.transpose_spmv() && !thandle.is_column_major())); + const char *tran = (transpose_spmv ? 
"T" : "N"); + if (!transpose_spmv) { // U stored in CSR + if (!invert_offdiagonal) { + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); + // copy from work to lhs corresponding to diagonal blocks + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + -1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); + } else { + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); + } + // update with off-diagonals (potentiall combined with diagonal + // solves) + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); - } - // update with off-diagonals (potentiall combined with diagonal - // solves) - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); - } else { - if (!invert_offdiagonal) { - // zero out lhs corresponding to diagonal blocks in lhs, and copy to - // work - SparseTriSupernodalSpMVFunctor - sptrsv_init_functor(1, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_init_functor); + if (!invert_offdiagonal) { + // zero out lhs corresponding to diagonal blocks in lhs, and copy + // to work + SparseTriSupernodalSpMVFunctor sptrsv_init_functor( + 1, node_count, nodes_grouped_by_level, supercols, supercols, + lhs, work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); - // update with off-diagonals - auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); + // update with off-diagonals + auto submat = thandle.get_submatrix(lvl); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); - // solve with diagonals - auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); - } else { - std::cout << " ** invert_offdiag with U in CSR not supported **" - << std::endl; + // solve with diagonals + auto digmat = thandle.get_diagblock(lvl); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); + } else { + std::cout << " ** invert_offdiag with U in CSR not supported **" + << 
std::endl; + } } - } - // reinitialize workspace - SparseTriSupernodalSpMVFunctor - sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, - supercols, supercols, lhs, work); - Kokkos::parallel_for( - "parfor_lsolve_supernode", - Kokkos::Experimental::require( - team_policy_type(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - sptrsv_finalize_functor); + // reinitialize workspace + SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor( + 0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, + work); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree - Kokkos::fence(); - double time_seconds = timer.seconds(); - std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds - << " kernel-type: " << kernel_type_host(lvl) - << " # of supernodes: " << lvl_nodes << std::endl; + Kokkos::fence(); + double time_seconds = timer.seconds(); + std::cout << " > SUPERNODAL UpperTri: " << lvl << " " << time_seconds + << " kernel-type: " << kernel_type_host(lvl) + << " # of supernodes: " << lvl_nodes << std::endl; #endif - } + } #endif - node_count += lvl_nodes; + node_count += lvl_nodes; #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); + cudaProfilerStop(); #endif - } // end if - } // end for lvl + } // end if + } // end for lvl #ifdef profile_supernodal_etree - Kokkos::fence(); - double sptrsv_time_seconds = sptrsv_timer.seconds(); - std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl - << std::endl; - std::cout << " + Execution space : " << ExecutionSpace::name() - << std::endl; - std::cout << " + Memory space : " << memory_space::name() << std::endl; + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds(); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl + << std::endl; + std::cout << " + Execution space : " << execution_space::name() + << std::endl; + std::cout << " + Memory space : " << temp_mem_space::name() + << std::endl; #endif -} // end upper_tri_solve + } // end upper_tri_solve -template -void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, - const RowMapType row_map, const EntriesType entries, - const ValuesType values, const RHSType &rhs, LHSType &lhs, - const bool /*is_lowertri_*/) { + template + static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, + const RowMapType row_map, + const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) - cudaProfilerStop(); -#endif - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - // Algorithm is checked before this function is called - auto h_chain_ptr = thandle.get_host_chain_ptr(); - size_type num_chain_entries = thandle.get_num_chain_entries(); - - // Keep this a host View, create device version and copy to back to host - // during scheduling This requires making sure the host view in the handle is - // properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - - auto nodes_grouped_by_level = 
thandle.get_nodes_grouped_by_level(); - - const bool is_lowertri = thandle.is_lower_tri(); - - size_type node_count = 0; - - // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; - using large_cutoff_policy_type = - Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ - using SingleBlockFunctor = - TriLvlSchedTP1SingleBlockFunctor; - - int team_size = thandle.get_team_size(); - int vector_size = - thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; - - auto cutoff = thandle.get_chain_threshold(); - int team_size_singleblock = team_size; - - // Enumerate options - // ts -1,0 | cu 0 - select default ts == 1 - // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) - // ts > 0 | cu 0 - set - // ts > 0 | cu > 0 - set - // Controls ts,cu > 0 - // co > ts - not all rows can be mapped to a thread - must call largercutoff - // impl co <= ts - okay, kernel must be careful not to access out-of-bounds; - // some threads idol - if (team_size_singleblock <= 0 && cutoff == 0) { - team_size_singleblock = 1; - // If cutoff == 0, no single-block calls will be made, team_size_singleblock - // is unimportant - } - - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + cudaProfilerStop(); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
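// Note on the "lvl == echain????" question: for a chain of length one
// (echain - schain == 1) the chain covers exactly one level, so schain is that
// level's index and hnodes_per_level(schain) is the node count to launch;
// echain itself is schain + 1, not a level index.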
- Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + // Algorithm is checked before this function is called + auto h_chain_ptr = thandle.get_host_chain_ptr(); + size_type num_chain_entries = thandle.get_num_chain_entries(); + + // Keep this a host View, create device version and copy to back to host + // during scheduling This requires making sure the host view in the handle + // is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + const bool is_lowertri = thandle.is_lower_tri(); + + size_type node_count = 0; + + // REFACTORED to cleanup; next, need debug and timer routines + using large_cutoff_policy_type = + Kokkos::TeamPolicy; + /* + using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = + LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = + UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = + LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = + UpperTriLvlSchedTP1SingleBlockFunctor; + */ + using SingleBlockFunctor = + TriLvlSchedTP1SingleBlockFunctor; + + int team_size = thandle.get_team_size(); + int vector_size = + thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; + + auto cutoff = thandle.get_chain_threshold(); + int team_size_singleblock = team_size; + + // Enumerate options + // ts -1,0 | cu 0 - select default ts == 1 + // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) + // ts > 0 | cu 0 - set + // ts > 0 | cu > 0 - set + // Controls ts,cu > 0 + // co > ts - not all rows can be mapped to a thread - must call + // largercutoff impl co <= ts - okay, kernel must be careful not to access + // out-of-bounds; some threads idol + if (team_size_singleblock <= 0 && cutoff == 0) { + team_size_singleblock = 1; + // If cutoff == 0, no single-block calls will be made, + // team_size_singleblock is unimportant + } - if (team_size_singleblock <= 0) { - team_size_singleblock = - policy_type(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + // This is only necessary for Lower,UpperTri functor versions; else, + // is_lowertri can be passed as arg to the generic Tri functor... 
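// Chain scheduling (SEQLVLSCHD_TP1CHAIN) in brief: h_chain_ptr partitions the
// levels into chains. A chain holding a single level is launched with the
// usual per-level team functor, while a chain of several small levels is
// folded into one single-block launch (SingleBlockFunctor) to reduce
// kernel-launch overhead. The cutoff (chain threshold) vs. team_size_singleblock
// comparison below decides whether one team can cover every row directly or
// must block-stride internally, and the is_lowertri branches differ mainly in
// which triangular functor flavor gets instantiated.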
+ if (is_lowertri) { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (cutoff <= team_size_singleblock) { + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); + true, node_count); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + LowerTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + node_count); #endif + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_l_team_chainmulti", + "parfor_l_team_chain1", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + node_count += lvl_nodes; + } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true); #else - LowerTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; - } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? 
- } - - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos + Kokkos::parallel_for( + "parfor_l_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, true, 0, + cutoff); #else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); + LowerTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); #endif - if (team_size == -1) { - team_size = - policy_type(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for( - "parfor_u_team_chain1", - Kokkos::Experimental::require( - policy_type(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); + Kokkos::parallel_for( + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + node_count += lvl_nodes; } + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? 
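// On the TODO above: Kokkos::fence() blocks the host on all execution space
// instances, whereas the suggested space.fence() would synchronize only the
// instance passed to this routine (presumably what a multi-stream caller
// wants). schain/echain/node_count are host-side values captured by value at
// functor construction, and kernels enqueued on the same instance execute in
// order, so the global fence here is conservative.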
+ } - if (team_size_singleblock <= 0) { - // team_size_singleblock = policy_type(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), - // Kokkos::ParallelForTag()); - team_size_singleblock = - policy_type(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + } else { + for (size_type chainlink = 0; chainlink < num_chain_entries; + ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); - if (cutoff <= team_size_singleblock) { + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); + is_lowertri, node_count); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); + node_count); #endif + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + // TODO To use cudagraph here, need to know how many non-unit chains + // there are, create a graph for each and launch accordingly + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? Kokkos::parallel_for( - "parfor_u_team_chainmulti", + "parfor_u_team_chain1", Kokkos::Experimental::require( - policy_type(space, 1, team_size_singleblock, vector_size), + team_policy(space, lvl_nodes, team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); + node_count += lvl_nodes; + } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + // team_size_singleblock = team_policy(1, 1, + // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, + // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, + // node_count), Kokkos::ParallelForTag()); + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain, is_lowertri), + Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, 0, - cutoff); + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri); #else - UpperTriLvlSchedTP1SingleBlockFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, 
nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain); #endif - Kokkos::parallel_for( - "parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chainmulti", + Kokkos::Experimental::require( + team_policy(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } else { + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, is_lowertri, + 0, cutoff); +#else + UpperTriLvlSchedTP1SingleBlockFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + nodes_per_level, node_count, schain, echain, cutoff); +#endif + Kokkos::parallel_for( + "parfor_u_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + } + node_count += lvl_nodes; } - node_count += lvl_nodes; + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? } - } - -} // end tri_solve_chain - -// -------------------------------- -// Stream interfaces -// -------------------------------- - -template -void lower_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. 
Launch work on all streams + } // end tri_solve_chain + + // -------------------------------- + // Stream interfaces + // -------------------------------- + template + static void lower_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + LowerTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], true, + node_count_v[i]); #else - LowerTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + LowerTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end lower_tri_solve_streams - -template -void upper_tri_solve_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, - std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using size_type = typename TriSolveHandle::size_type; - using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. 
Launch work on all streams + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end lower_tri_solve_streams + + template + static void upper_tri_solve_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy( - execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], - nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + range_policy(execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + UpperTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], false, + node_count_v[i]); #else - UpperTriLvlSchedTP1SolverFunctor< - RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + UpperTriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - policy_type(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl -} // end upper_tri_solve_streams + if (team_size == -1) + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for( + "parfor_l_team", + team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + } // end for lvl + } // end upper_tri_solve_streams + +}; // struct SptrsvWrap } // namespace Experimental } // namespace Impl diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 6ad321c286..d69c499c60 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -120,6 +120,9 @@ struct SPTRSV_SOLVE; + // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -132,19 +135,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, true); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Experimental::lower_tri_solve(space, 
*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -153,19 +156,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, - values, b, x, false); + Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, + b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, + values, b, x); } } Kokkos::Profiling::popRegion(); @@ -178,6 +181,8 @@ struct SPTRSV_SOLVE &entries_v, const std::vector &values_v, const std::vector &b_v, std::vector &x_v) { + using Sptrsv = + Experimental::SptrsvWrap; // Call specific algorithm type // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type @@ -197,9 +202,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -208,9 +212,8 @@ struct SPTRSV_SOLVEsptrsvHandle; } void create_sptrsv_handle(KokkosSparse::Experimental::SPTRSVAlgorithm algm, - size_type nrows, bool lower_tri) { + size_type nrows, bool lower_tri, + size_type block_size = 0) { this->destroy_sptrsv_handle(); this->is_owner_of_the_sptrsv_handle = true; - this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri); + this->sptrsvHandle = + new SPTRSVHandleType(algm, nrows, lower_tri, block_size); // this->sptrsvHandle->init_handle(nrows); this->sptrsvHandle->set_team_size(this->team_work_size); this->sptrsvHandle->set_vector_size(this->vector_size); diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index cf23bfdc1f..fb322b7f95 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -56,76 +56,79 @@ template class SPTRSVHandle { public: - typedef ExecutionSpace HandleExecSpace; - typedef TemporaryMemorySpace HandleTempMemorySpace; - typedef PersistentMemorySpace HandlePersistentMemorySpace; - - typedef ExecutionSpace execution_space; - typedef HandlePersistentMemorySpace memory_space; - - typedef typename std::remove_const::type size_type; - typedef const size_type const_size_type; - - typedef typename std::remove_const::type nnz_lno_t; - typedef const nnz_lno_t const_nnz_lno_t; - - typedef typename std::remove_const::type scalar_t; - typedef const scalar_t const_nnz_scalar_t; - - // row_map type (managed memory) - typedef typename Kokkos::View - nnz_row_view_temp_t; - typedef typename Kokkos::View - nnz_row_view_t; - typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t; - typedef typename Kokkos::View - int_row_view_t; - typedef typename Kokkos::View - int64_row_view_t; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; + + using execution_space = ExecutionSpace; + using 
memory_space = HandlePersistentMemorySpace; + + using TeamPolicy = Kokkos::TeamPolicy; + using RangePolicy = Kokkos::RangePolicy; + + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; + + using nnz_lno_t = typename std::remove_const::type; + using const_nnz_lno_t = const nnz_lno_t; + + using scalar_t = typename std::remove_const::type; + using const_nnz_scalar_t = const scalar_t; + + // Row_map type (managed memory) + using nnz_row_view_temp_t = + typename Kokkos::View; + using nnz_row_view_t = + typename Kokkos::View; + using host_nnz_row_view_t = typename nnz_row_view_t::HostMirror; + using int_row_view_t = + typename Kokkos::View; + using int64_row_view_t = + typename Kokkos::View; // typedef typename row_lno_persistent_work_view_t::HostMirror // row_lno_persistent_work_host_view_t; //Host view type - typedef typename Kokkos::View< + using nnz_row_unmanaged_view_t = typename Kokkos::View< const size_type *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_row_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // values type (managed memory) - typedef typename Kokkos::View - nnz_scalar_view_temp_t; - typedef typename Kokkos::View - nnz_scalar_view_t; - typedef typename nnz_scalar_view_t::HostMirror host_nnz_scalar_view_t; - typedef typename Kokkos::View< + using nnz_scalar_view_temp_t = + typename Kokkos::View; + using nnz_scalar_view_t = + typename Kokkos::View; + using host_nnz_scalar_view_t = typename nnz_scalar_view_t::HostMirror; + using nnz_scalar_unmanaged_view_t = typename Kokkos::View< const scalar_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_scalar_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // entries type (managed memory) - typedef typename Kokkos::View - nnz_lno_view_temp_t; - typedef typename Kokkos::View - nnz_lno_view_t; - typedef typename Kokkos::View - hostspace_nnz_lno_view_t; - typedef typename nnz_lno_view_t::HostMirror host_nnz_lno_view_t; - typedef typename Kokkos::View< + using nnz_lno_view_temp_t = + typename Kokkos::View; + using nnz_lno_view_t = + typename Kokkos::View; + using hostspace_nnz_lno_view_t = + typename Kokkos::View; + using host_nnz_lno_view_t = typename nnz_lno_view_t::HostMirror; + using nnz_lno_unmanaged_view_t = typename Kokkos::View< const nnz_lno_t *, HandlePersistentMemorySpace, - Kokkos::MemoryTraits> - nnz_lno_unmanaged_view_t; // for rank1 subviews + Kokkos::MemoryTraits>; // for rank1 subviews // typedef typename nnz_lno_persistent_work_view_t::HostMirror // nnz_lno_persistent_work_host_view_t; //Host view type - typedef typename std::make_signed< - typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; - typedef Kokkos::View - signed_nnz_lno_view_t; - typedef typename signed_nnz_lno_view_t::HostMirror host_signed_nnz_lno_view_t; + using signed_integral_t = typename std::make_signed< + typename nnz_row_view_t::non_const_value_type>::type; + using signed_nnz_lno_view_t = + Kokkos::View; - typedef typename Kokkos::View - mtx_scalar_view_t; + using host_signed_nnz_lno_view_t = typename signed_nnz_lno_view_t::HostMirror; + + using mtx_scalar_view_t = + typename Kokkos::View; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) @@ -214,7 +217,7 @@ class SPTRSVHandle { }; #endif - typedef cuSparseHandleType SPTRSVcuSparseHandleType; + using SPTRSVcuSparseHandleType = cuSparseHandleType; #endif #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT @@ -228,7 +231,7 @@ 
class SPTRSVHandle { //~cudaGraphWrapperType() { } }; - typedef cudaGraphWrapperType SPTRSVcudaGraphWrapperType; + using SPTRSVcudaGraphWrapperType = cudaGraphWrapperType; void create_SPTRSVcudaGraphWrapperType() { destroy_SPTRSVcudaGraphWrapperType(); @@ -296,6 +299,7 @@ class SPTRSVHandle { nnz_lno_view_t nodes_grouped_by_level; hostspace_nnz_lno_view_t hnodes_grouped_by_level; // NEW size_type nlevel; + size_type block_size; // block_size > 0 implies BSR int team_size; int vector_size; @@ -423,7 +427,8 @@ class SPTRSVHandle { public: SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, - bool symbolic_complete_ = false, bool numeric_complete_ = false) + const size_type block_size_ = 0, bool symbolic_complete_ = false, + bool numeric_complete_ = false) : #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT cudagraphCreated(false), @@ -438,6 +443,7 @@ class SPTRSVHandle { nodes_grouped_by_level(), hnodes_grouped_by_level(), nlevel(0), + block_size(block_size_), team_size(-1), vector_size(-1), stored_diagonal(false), @@ -1007,6 +1013,14 @@ class SPTRSVHandle { void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + KOKKOS_INLINE_FUNCTION + void set_block_size(const size_type block_size_) { + this->block_size = block_size_; + } + void set_symbolic_complete() { this->symbolic_complete = true; } void set_symbolic_incomplete() { this->symbolic_complete = false; } From cb7a552b75ddf7648782b127e82e8210bd4bca48 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Wed, 10 Jul 2024 20:21:19 -0600 Subject: [PATCH 28/32] Sparse - SpMV: removing calls to unsuported oneapi - MKL functions (#2274) --- sparse/src/KokkosSparse_spmv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 5fa0be3619..de98701b7c 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -281,7 +281,7 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], #ifdef KOKKOS_ENABLE_SYCL if constexpr (std::is_same_v) { - useNative = useNative || (mode[0] == Conjugate[0]); + useNative = useNative || (mode[0] != NoTranspose[0]); } #endif #endif From aaa634b6c4a8b97d24fea68b863cbe9dd615764b Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 11 Jul 2024 18:52:28 -0600 Subject: [PATCH 29/32] Sycl gemv beta (#2276) * BLAS - GEMV: zero out Y when beta == 0 in SYCL TPL code path * BLAS - GEMV: reverting wrong change from previous PR, my bad. 
* Applying clang-format --- blas/src/KokkosBlas2_gemv.hpp | 5 ++--- blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index e68f2cca75..88ffc63810 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -165,9 +165,8 @@ void gemv(const ExecutionSpace& space, const char trans[], // oneMKL supports both row-major and column-major of A // but only supports oneapi::mkl::transpose::nontrans op useFallback = - useFallback || ((tolower(*trans) == 't' || tolower(*trans) == 'c') && - std::is_same_v); + useFallback || !std::is_same_v; #endif #endif diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 304dd349bf..07d9476b66 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -824,6 +824,10 @@ struct kokkos_to_std_type_map { const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ const YViewType& Y) { \ + if (beta == Kokkos::ArithTraits::zero()) { \ + Kokkos::deep_copy(Y, Kokkos::ArithTraits::zero()); \ + } \ + \ bool row_major = std::is_same::value; \ const std::int64_t M = A.extent(0); \ const std::int64_t N = A.extent(1); \ From e7a4b0723857d56d3d15301456606760c3f17252 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Fri, 12 Jul 2024 10:38:47 -0400 Subject: [PATCH 30/32] Unify alignPtrTo implementation (#2275) --- common/src/KokkosKernels_Utils.hpp | 37 ++++++------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 89aeabb823..92419424b6 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1527,41 +1527,18 @@ struct array_sum_reduce { } }; -/* Several alternatives were considered for SYCL, including - -unsigned int f1(unsigned int i, unsigned int align) -{ - return ((i + align - 1) / align * align); -} - -unsigned int f2(unsigned int i, unsigned int align) -{ - return (i + align - 1) & (-align); -} - -f1 should be equivalent to the below, but it produces incorrect results on SYCL -f2 is how GCC does std::align, but it also produces incorrect results on SYCL -possibly alignof(T) is not a power-of-2 on SYCL? Or a compiler error. -*/ -#if defined(KOKKOS_ENABLE_SYCL) template -KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { - std::uintptr_t ptrVal = reinterpret_cast(p); - while (ptrVal % alignof(T)) { - ++ptrVal; - } - return reinterpret_cast(ptrVal); -} -#else -template -KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr p) { +KOKKOS_INLINE_FUNCTION T *alignPtrTo(InPtr *p) { // ugly but computationally free and the "right" way to do this in C++ - std::uintptr_t ptrVal = reinterpret_cast(p); + const std::uintptr_t ptrVal = reinterpret_cast(p); // ptrVal + (align - 1) lands inside the next valid aligned scalar_t, // and the mask produces the start of that scalar_t. 
- return reinterpret_cast((ptrVal + alignof(T) - 1) & (~(alignof(T) - 1))); + const std::uintptr_t ptrValNew = + (ptrVal + alignof(T) - 1) & (~(alignof(T) - 1)); + return reinterpret_cast( + reinterpret_cast(const_cast *>(p)) + + (ptrValNew - ptrVal)); } -#endif } // namespace Impl } // namespace KokkosKernels From 3ce7adb7280cf190f15b205b11aac06dcb2d6181 Mon Sep 17 00:00:00 2001 From: Baptiste Legouix Date: Fri, 12 Jul 2024 16:40:09 +0200 Subject: [PATCH 31/32] init (#2273) --- sparse/impl/KokkosSparse_spmv_team_spec.hpp | 4 ++-- sparse/src/KokkosSparse_spmv_team.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_team_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp index 156123b113..f065a34fb6 100644 --- a/sparse/impl/KokkosSparse_spmv_team_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -37,7 +37,7 @@ struct TeamSpmv { return Impl::TeamSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); @@ -56,7 +56,7 @@ struct TeamVectorSpmv { return Impl::TeamVectorSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( - member, x.extent(0), alpha, values.data(), values.stride_0(), + member, y.extent(0), alpha, values.data(), values.stride_0(), row_ptr.data(), row_ptr.stride_0(), colIndices.data(), colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index 6c68478501..c3f2bfa49f 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -62,7 +62,7 @@ int KOKKOS_INLINE_FUNCTION team_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", @@ -116,7 +116,7 @@ int KOKKOS_INLINE_FUNCTION team_vector_spmv( return 1; } - if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + if ((x.extent(0) + 1) != row_ptr.extent(0)) { Kokkos::printf( "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " "x: %d, y: %d, row_ptr: %d", From c93b6dc32f6eaa4fc8a13052fb638a30e6a34b49 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 16 Jul 2024 15:13:31 -0600 Subject: [PATCH 32/32] Bigger sptrsv cleanup (#2280) * Some cleanup and refactoring * Remove Upper/Lower TriLvlSchedTP2SolverFunctors * Remove Upper/Lower single block functors * Remove unused TriLvlSchedTP1SingleBlockFunctorDiagValues and merge upper/lower tri_solve_cg * Merge two big upper/lower branch of tri_solve_chain * Merge upper/lower tri_solve_streams * Switch over block spiluk precond test to use new block sptrsv --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 +- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 2646 +++-------------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 32 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 559 +--- 4 files changed, 585 insertions(+), 2656 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp 
b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 019a63fcd7..0a4a75933e 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -305,7 +305,6 @@ void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #if (CUDA_VERSION >= 11030) typedef typename KernelHandle::nnz_lno_t idx_type; - typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; @@ -474,7 +473,6 @@ void sptrsvcuSPARSE_solve_streams( ) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE using idx_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; using scalar_type = typename KernelHandle::nnz_scalar_t; using memory_space = typename KernelHandle::HandlePersistentMemorySpace; using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; @@ -544,6 +542,8 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 + using size_type = typename KernelHandle::size_type; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index d385a390cd..bc31f14791 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -35,8 +35,6 @@ #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif -//#define SERIAL_FOR_LOOP - #define KOKKOSKERNELS_SPTRSV_TRILVLSCHED //#define KOKKOSPSTRSV_SOLVE_IMPL_PROFILE 1 @@ -100,11 +98,12 @@ struct SptrsvWrap { void operator()(const int) const {} }; - // This functor unifies the lower and upper implementations, the hope is the - // "is_lowertri" check does not add noticable time on larger problems + /** + * Common base class for sptrsv functors + */ template - struct TriLvlSchedTP1SolverFunctor { + class LHSType, class RHSType, bool BlockEnabled> + struct Common { RowMapType row_map; EntriesType entries; ValuesType values; @@ -112,538 +111,280 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - - TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const bool &is_lowertri_, - const long &node_count_) + Common(const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_) {} + nodes_grouped_by_level(nodes_grouped_by_level_) { + KK_REQUIRE_MSG(!BlockEnabled, "Blocks are not yet supported."); + KK_REQUIRE_MSG(block_size_ == 0, "Blocks are not yet supported."); + } - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - 
Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + struct ReduceSumFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); + } + }; - team.team_barrier(); + struct ReduceSumDiagFunctor { + const Common *m_obj; + const lno_t rowid; + lno_t diag; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri ? (rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, scalar_t &accum) const { + const auto colid = m_obj->entries(i); + if (colid != rowid) { + auto val = m_obj->values(i); + auto lhs_colid = m_obj->lhs(colid); + accum -= val * lhs_colid; + } else { + diag = i; + } } - } + }; KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } + static void add_and_divide(scalar_t &lhs_val, const scalar_t &rhs_val, + const scalar_t &diag_val) { + lhs_val = (lhs_val + rhs_val) / diag_val; } - }; - - template - struct TriLvlSchedTP1SolverFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - ValuesType diagonal_values; // inserted according to rowid - - const bool is_lowertri; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long dense_nrows; - - TriLvlSchedTP1SolverFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const ValuesType &diagonal_values_, const bool is_lowertri_, - long node_count_, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - diagonal_values(diagonal_values_), - is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto 
soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + template + KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, + const int my_rank, + const long node_count) const { + static_assert( + !((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); + static_assert(!(IsSerial && UseThreadVec), + "Requested thread vector range in serial?"); + + const auto rowid = nodes_grouped_by_level(my_rank + node_count); + const auto soffset = row_map(rowid); + const auto eoffset = row_map(rowid + 1); + const auto rhs_val = rhs(rowid); + scalar_t &lhs_val = lhs(rowid); + + // Set up range to auto-skip diag if is sorted + const auto itr_b = soffset + (IsSorted ? (IsLower ? 0 : 1) : 0); + const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); + + // We don't need the reducer to find the diag item if sorted + using reducer_t = + std::conditional_t; + reducer_t rf{this, rowid, -1}; + + if constexpr (IsSerial) { + KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); + KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); + for (auto ptr = itr_b; ptr < itr_e; ++ptr) { + rf(ptr, lhs_val); + } + } else { + KK_KERNEL_ASSERT_MSG(team != nullptr, + "Cannot do team operations without team"); + if constexpr (!UseThreadVec) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), + rf, lhs_val); + team->team_barrier(); + } else { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); + } + } - team.team_barrier(); + // If sorted, we already know the diag. Otherwise, get it from the reducer + rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : - // (rhs_rowid+diff)/values(soffset); - lhs(rowid) = (rhs_rowid + diff) / diagonal_values(rowid); + // At end, handle the diag element. We need to be careful to avoid race + // conditions here. + if constexpr (IsSerial) { + // Serial case is easy, there's only 1 thread so just do the + // add_and_divide + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } else { + if constexpr (IsSorted) { + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide. + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Kokkos::single(Kokkos::PerTeam(*team), [&]() { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + }); + } else { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } else { + // Parallel unsorted case. Only one thread should know what the diag + // item is. We have that one do the add_and_divide. 
+ if (rf.diag != -1) { + add_and_divide(lhs_val, rhs_val, values(rf.diag)); + } + } } } }; template - struct TriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; + class LHSType, class RHSType, bool IsLower, bool BlockEnabled> + struct TriLvlSchedTP1SolverFunctor + : public Common { + using Base = Common; - const bool is_lowertri; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long node_groups; - long dense_nrows; - TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long node_groups_ = 0, long dense_nrows_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - is_lowertri(is_lowertri_), - node_count(node_count_), - node_groups(node_groups_), - dense_nrows(dense_nrows_) {} + const long &node_count_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_), + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = is_lowertri - ? 
(rhs_rowid + diff) / values(eoffset - 1) - : (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + Base::template solve_impl(&team, team.league_rank(), + node_count); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + Base::template solve_impl( + &team, team.league_rank(), node_count); } }; // Lower vs Upper Multi-block Functors template - struct LowerTriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - LowerTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} + class LHSType, class RHSType, bool IsLower, bool BlockEnabled> + struct TriLvlSchedRPSolverFunctor + : public Common { + using Base = Common; + + TriLvlSchedRPSolverFunctor(const RowMapType &row_map_, + const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, + const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, + const size_type block_size_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, + block_size_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr + Base::template solve_impl(nullptr, 0, i); } KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - - for (long ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); + Base::template solve_impl(nullptr, 0, i); } }; template - struct LowerTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; 
- LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; + class LHSType, class RHSType, bool IsLower> + struct TriLvlSchedTP1SingleBlockFunctor + : public Common { + using Base = + Common; + + entries_t nodes_per_level; long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long node_groups; - - LowerTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), + long lvl_start; + long lvl_end; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + TriLvlSchedTP1SingleBlockFunctor( + const RowMapType &row_map_, const EntriesType &entries_, + const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, + const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, + long node_count_, long lvl_start_, long lvl_end_, + const int dense_nrows_ = 0, const int cutoff_ = 0) + : Base(row_map_, entries_, values_, lhs_, rhs_, + nodes_grouped_by_level_), + nodes_per_level(nodes_per_level_), node_count(node_count_), - node_groups(node_groups_) {} + lvl_start(lvl_start_), + lvl_end(lvl_end_), + dense_nrows(dense_nrows_), + cutoff(cutoff_) {} - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); + // SingleBlock: Only one block (or league) executing; team_rank used to map + // thread to row - team.team_barrier(); + template + KOKKOS_INLINE_FUNCTION void common_impl(const member_type &team) const { + auto mut_node_count = node_count; - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); + for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { + const auto nodes_this_lvl = nodes_per_level(lvl); + const auto my_team_rank = team.team_rank(); + const auto loop_cutoff = LargerCutoff ? cutoff : my_team_rank + 1; + // If cutoff > team_size, then a thread will be responsible for multiple + // rows - this may be a helpful scenario depending on occupancy etc. 
+ for (int my_rank = my_team_rank; my_rank < loop_cutoff; + my_rank += team.team_size()) { + if (my_rank < nodes_this_lvl) { + Base::template solve_impl( + &team, my_rank, mut_node_count); + } + } + mut_node_count += nodes_this_lvl; + team.team_barrier(); } } KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this; can also use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } + void operator()(const member_type &team) const { + common_impl(team); } - }; - - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for alg dev - template - struct LowerTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(eoffset - 1); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); + void operator()(const UnsortedTag &, const member_type &team) const { + common_impl(team); } KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * 
node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange + void operator()(const LargerCutoffTag &, const member_type &team) const { + common_impl(team); + } - team.team_barrier(); + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedLargerCutoffTag &, + const member_type &team) const { + common_impl(team); } }; @@ -1217,1396 +958,54 @@ struct SptrsvWrap { KokkosBatched::Uplo::Upper, KokkosBatched::Trans::NoTranspose, KokkosBatched::Diag::NonUnit, KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, - Xjj); - } - } - team.team_barrier(); - } - if (nsrow2 > 0) { - /* GEMM to update off diagonal blocks, Z = Uij * Xj */ - auto Z = Kokkos::subview( - work, range_type(workoffset + nscol, workoffset + nsrow)); - if (!invert_offdiagonal && diag_kernel_type(level) != 3) { - // not device-level TRSM-solve - auto Uij = - Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); - KokkosBlas::TeamGemv::invoke(team, - one, - Uij, - Xj, - zero, - Z); - team.team_barrier(); - } - - /* scatter vector into Z */ - int i2 = i1 + nscol; // offset into rowind - Kokkos::View> - Xatomic(X.data(), X.extent(0)); - for (int ii = team_rank; ii < nsrow2; ii += team_size) { - int i = rowind(i2 + ii); - Xatomic(i) -= Z(ii); - } - team.team_barrier(); - } - } - }; -#endif - - template - struct UpperTriLvlSchedRPSolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - UpperTriLvlSchedRPSolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - // Assuming indices are sorted per row, diag entry is final index in the - // list - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - lhs(rowid) = rhs_rowid / val; - } - } // end for ptr - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const lno_t i) const { - auto rowid = nodes_grouped_by_level(i); - long soffset = row_map(rowid); - long eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - auto diag = -1; - for (long ptr = eoffset - 1; ptr >= soffset; --ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - rhs_rowid = rhs_rowid - val * lhs(colid); - } else { - diag = ptr; - } - } // end for ptr - lhs(rowid) = rhs_rowid / values(diag); - } - }; - - template - struct UpperTriLvlSchedTP1SolverFunctor { - RowMapType row_map; - 
EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_rank = team.team_rank(); - - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - team.team_barrier(); - - // At end, finalize rowid == colid - // only one thread should do this, also can use Kokkos::single - if (my_rank == 0) { - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } - } - }; - - // FIXME CUDA: This algorithm not working with all integral type combos - // In any case, this serves as a skeleton for 3-level hierarchical parallelism - // for alg dev - template - struct UpperTriLvlSchedTP2SolverFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long node_groups; - - UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, - const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, - const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - long node_count_, long node_groups_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - node_count(node_count_), - node_groups(node_groups_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) 
- 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } - }, - diff); - - // ASSUMPTION: sorted diagonal value located at start offset - lhs(rowid) = (rhs_rowid + diff) / values(soffset); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - - size_t nrows = row_map.extent(0) - 1; - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, 0, node_groups), [&](const long ng) { - auto rowid = nodes_grouped_by_level(node_count + - my_league * node_groups + ng); - if (size_t(rowid) < nrows) { - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); - auto rhs_rowid = rhs(rowid); - scalar_t diff = scalar_t(0.0); - - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, soffset, eoffset), - [&](const long ptr, scalar_t &tdiff) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff = tdiff - val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); - - lhs(rowid) = (rhs_rowid + diff) / values(diag); - } // end if - }); // end TeamThreadRange - - team.team_barrier(); - } - }; - - // -------------------------------- - // Single-block functors - // -------------------------------- - - template - struct LowerTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
- - LowerTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - - auto colid = entries(ptr); - auto val = 
values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. - for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct UpperTriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - long cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - UpperTriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = 
values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at soffset - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl each thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
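// A minimal sketch (plain std::vector stand-ins, not the view types used by the
// functors) of the unsorted-row case handled by the Unsorted* operators: when
// column indices within a row are not sorted, the sweep must both accumulate the
// off-diagonal contributions and record where the diagonal entry sits before the
// final division.
#include <vector>

static double solve_unsorted_row(int rowid,
                                 const std::vector<int>    &row_map,
                                 const std::vector<int>    &entries,
                                 const std::vector<double> &values,
                                 const std::vector<double> &lhs,
                                 const std::vector<double> &rhs) {
  double diff = 0.0;
  int diag = -1;
  for (int ptr = row_map[rowid]; ptr < row_map[rowid + 1]; ++ptr) {
    const int colid = entries[ptr];
    if (colid != rowid) {
      diff -= values[ptr] * lhs[colid];  // off-diagonal contribution
    } else {
      diag = ptr;                        // remember the diagonal's position
    }
  }
  return (rhs[rowid] + diff) / values[diag];  // same update as lhs(rowid) above
}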
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct TriLvlSchedTP1SingleBlockFunctor { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... - - TriLvlSchedTP1SingleBlockFunctor( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, entries_t &nodes_per_level_, - long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, - const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - 
soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
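// A minimal sketch of the sorted-diagonal assumption stated above, using a
// hand-rolled 3x3 lower-triangular matrix in CSR form.  With sorted column
// indices the diagonal of row i is the last entry of the row (row_map[i+1]-1);
// for an upper-triangular matrix it would instead be the first entry (row_map[i]).
static void sorted_diagonal_example() {
  // L = [ 2 0 0 ; 1 3 0 ; 4 5 6 ]
  const int    row_map[] = {0, 1, 3, 6};
  const double values[]  = {2.0, 1.0, 3.0, 4.0, 5.0, 6.0};
  for (int row = 0; row < 3; ++row) {
    const int eoffset  = row_map[row + 1];
    const double diag  = values[eoffset - 1];  // 2, 3, 6: the diagonal values
    (void)diag;
  }
}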
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for - // lower tri, soffset for upper tri - if (is_lowertri) - lhs(rowid) = (rhs_val + diff) / values(eoffset - 1); - else - lhs(rowid) = (rhs_val + diff) / values(soffset); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - - KOKKOS_INLINE_FUNCTION - void operator()(const UnsortedLargerCutoffTag &, - const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
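// A minimal sketch of the vectorized row sweep chosen when SERIAL_FOR_LOOP is
// not defined: the entries of one row are reduced across the calling thread's
// vector lanes with a nested ThreadVectorRange reduction.  The view type names
// are placeholders; the lambda body mirrors the reduction used by the functors.
#include <Kokkos_Core.hpp>

template <class RowMapView, class EntriesView, class ValuesView, class VectorView>
KOKKOS_INLINE_FUNCTION double row_dot_minus_diag(
    const typename Kokkos::TeamPolicy<>::member_type &team, int rowid,
    const RowMapView &row_map, const EntriesView &entries,
    const ValuesView &values, const VectorView &lhs) {
  const auto soffset = row_map(rowid);
  const auto trange  = row_map(rowid + 1) - soffset;
  double diff        = 0.0;
  Kokkos::parallel_reduce(
      Kokkos::ThreadVectorRange(team, trange),
      [&](const int loffset, double &tdiff) {
        const auto ptr   = soffset + loffset;
        const auto colid = entries(ptr);
        if (colid != rowid) tdiff -= values(ptr) * lhs(colid);
      },
      diff);
  return diff;
}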
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } else { - diag = ptr; - } - } -#else - auto trange = eoffset - soffset; - auto diag = -1; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } else { - diag = ptr; - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / values(diag); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - - template - struct TriLvlSchedTP1SingleBlockFunctorDiagValues { - RowMapType row_map; - EntriesType entries; - ValuesType values; - LHSType lhs; - RHSType rhs; - entries_t nodes_grouped_by_level; - entries_t nodes_per_level; - ValuesType diagonal_values; - - long node_count; // like "block" offset into ngbl, my_league is the "local" - // offset - long lvl_start; - long lvl_end; - const bool is_lowertri; - const int dense_nrows; - const int cutoff; - // team_size: each team can be assigned a row, if there are enough rows... 
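// A minimal sketch of how the diagonal_values view consumed by the *DiagValues
// functor below could be filled once up front, so the solve never has to search
// a row for its diagonal.  extract_diagonals is a hypothetical helper, not part
// of this patch; it assumes a sorted lower-triangular CSR matrix, where the
// diagonal is the last entry of each row.
#include <Kokkos_Core.hpp>

template <class RowMapView, class ValuesView, class DiagView>
void extract_diagonals(const RowMapView &row_map, const ValuesView &values,
                       const DiagView &diagonal_values) {
  const int nrows = static_cast<int>(row_map.extent(0)) - 1;
  Kokkos::parallel_for(
      "extract_diagonals", Kokkos::RangePolicy<>(0, nrows),
      KOKKOS_LAMBDA(const int rowid) {
        diagonal_values(rowid) = values(row_map(rowid + 1) - 1);
      });
}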
- - TriLvlSchedTP1SingleBlockFunctorDiagValues( - const RowMapType &row_map_, const EntriesType &entries_, - const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - const entries_t &nodes_grouped_by_level_, - const entries_t &nodes_per_level_, const ValuesType &diagonal_values_, - long node_count_, const long lvl_start_, const long lvl_end_, - const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0) - : row_map(row_map_), - entries(entries_), - values(values_), - lhs(lhs_), - rhs(rhs_), - nodes_grouped_by_level(nodes_grouped_by_level_), - nodes_per_level(nodes_per_level_), - diagonal_values(diagonal_values_), - node_count(node_count_), - lvl_start(lvl_start_), - lvl_end(lvl_end_), - is_lowertri(is_lower_), - dense_nrows(dense_nrows_), - cutoff(cutoff_) {} - - // SingleBlock: Only one block (or league) executing; team_rank used to map - // thread to row - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_rank = team.team_rank(); - diff = scalar_t(0.0); - - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower - // tri, soffset for upper tri - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; - } - team.team_barrier(); - } // end for lvl - } // end operator - - KOKKOS_INLINE_FUNCTION - void operator()(const LargerCutoffTag &, const member_type &team) const { - long mut_node_count = node_count; - typename entries_t::non_const_value_type rowid{0}; - typename RowMapType::non_const_value_type soffset{0}; - typename RowMapType::non_const_value_type eoffset{0}; - typename RHSType::non_const_value_type rhs_val{0}; - scalar_t diff = scalar_t(0.0); - - for (auto lvl = lvl_start; lvl < lvl_end; ++lvl) { - auto nodes_this_lvl = nodes_per_level(lvl); - int my_team_rank = team.team_rank(); - // If cutoff > team_size, then a thread will be responsible for multiple - // rows - this may be a helpful scenario depending on occupancy etc. 
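// A minimal host-side sketch (plain arrays instead of views) of the level-set
// bookkeeping repeated in the operators: node_count is the offset of lvl_start
// inside nodes_grouped_by_level, and mut_node_count advances by
// nodes_per_level(lvl) after each level, so (rank + mut_node_count) always
// indexes the current level's slice of rows.
#include <vector>

static std::vector<int> rows_of_level(const std::vector<int> &nodes_per_level,
                                      const std::vector<int> &nodes_grouped_by_level,
                                      int lvl_start, int lvl, int node_count) {
  int mut_node_count = node_count;
  for (int l = lvl_start; l < lvl; ++l) mut_node_count += nodes_per_level[l];
  std::vector<int> rows(nodes_per_level[lvl]);
  for (int rank = 0; rank < nodes_per_level[lvl]; ++rank) {
    rows[rank] = nodes_grouped_by_level[rank + mut_node_count];
  }
  return rows;
}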
- for (int my_rank = my_team_rank; my_rank < cutoff; - my_rank += team.team_size()) { - diff = scalar_t(0.0); - if (my_rank < nodes_this_lvl) { - // THIS is where the mapping of threadid to rowid happens - rowid = nodes_grouped_by_level(my_rank + mut_node_count); - soffset = row_map(rowid); - eoffset = row_map(rowid + 1); - rhs_val = rhs(rowid); - -#ifdef SERIAL_FOR_LOOP - for (auto ptr = soffset; ptr < eoffset; ++ptr) { - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - diff -= val * lhs(colid); - } - } -#else - auto trange = eoffset - soffset; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, trange), - [&](const int loffset, scalar_t &tdiff) { - auto ptr = soffset + loffset; - auto colid = entries(ptr); - auto val = values(ptr); - if (colid != rowid) { - tdiff -= val * lhs(colid); - } - }, - diff); -#endif - lhs(rowid) = (rhs_val + diff) / diagonal_values(rowid); - } // end if team.team_rank() < nodes_this_lvl - } // end for my_rank loop - { - // Update mut_node_count from nodes_per_level(lvl) each iteration of - // lvl per thread - mut_node_count += nodes_this_lvl; + Xjj); + } } team.team_barrier(); - } // end for lvl - } // end tagged operator - }; - -#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT - template - static void lower_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { - typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = - thandle.get_sptrsvCudaGraph(); - - auto nlevels = thandle.get_num_levels(); - - auto stream1 = lcl_cudagraph->stream; - Kokkos::Cuda cuda1(stream1); - auto graph = lcl_cudagraph->cudagraph; - - Kokkos::parallel_for("Init", Kokkos::RangePolicy(0, 1), - EmptyFunctor()); - Kokkos::Cuda().fence(); - cudaStreamSynchronize(stream1); - // Kokkos::fence(); - - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - - size_type node_count = 0; - - int team_size = thandle.get_team_size(); - team_size = team_size == -1 ? 64 : team_size; - - // Start capturing stream - if (thandle.cudagraphCreated == false) { - Kokkos::fence(); - cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); - { - for (int iter = 0; iter < nlevels; ++iter) { - size_type lvl_nodes = hnodes_per_level(iter); - - auto policy = std::is_same::value - ? 
team_policy(lvl_nodes, team_size, cuda1) - : team_policy(lvl_nodes, team_size); - - Kokkos::parallel_for( - "parfor_l_team_cudagraph", - Kokkos::Experimental::require( - policy, - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedTP1SolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count)); + } + if (nsrow2 > 0) { + /* GEMM to update off diagonal blocks, Z = Uij * Xj */ + auto Z = Kokkos::subview( + work, range_type(workoffset + nscol, workoffset + nsrow)); + if (!invert_offdiagonal && diag_kernel_type(level) != 3) { + // not device-level TRSM-solve + auto Uij = + Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, + Xj, + zero, + Z); + team.team_barrier(); + } - node_count += hnodes_per_level(iter); + /* scatter vector into Z */ + int i2 = i1 + nscol; // offset into rowind + Kokkos::View> + Xatomic(X.data(), X.extent(0)); + for (int ii = team_rank; ii < nsrow2; ii += team_size) { + int i = rowind(i2 + ii); + Xatomic(i) -= Z(ii); } + team.team_barrier(); } - cudaStreamEndCapture(stream1, &graph); - - // Create graphExec - cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, - NULL, 0); - thandle.cudagraphCreated = true; } - // Run graph - Kokkos::fence(); - cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + }; +#endif - cudaStreamSynchronize(stream1); - Kokkos::fence(); - } // end lower_tri_solve_cg + // + // End of functors, begin external API + // - template - static void upper_tri_solve_cg(TriSolveHandle &thandle, - const RowMapType row_map, - const EntriesType entries, - const ValuesType values, const RHSType &rhs, - LHSType &lhs) { + static void tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, + const EntriesType entries, const ValuesType values, + const RHSType &rhs, LHSType &lhs) { typename TriSolveHandle::SPTRSVcudaGraphWrapperType *lcl_cudagraph = thandle.get_sptrsvCudaGraph(); @@ -2642,12 +1041,12 @@ struct SptrsvWrap { : team_policy(lvl_nodes, team_size); Kokkos::parallel_for( - "parfor_u_team_cudagraph", + "parfor_l_team_cudagraph", Kokkos::Experimental::require( policy, Kokkos::Experimental::WorkItemProperty::HintLightWeight), - UpperTriLvlSchedTP1SolverFunctor( + TriLvlSchedTP1SolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); @@ -2667,10 +1066,14 @@ struct SptrsvWrap { cudaStreamSynchronize(stream1); Kokkos::fence(); - } // end upper_tri_solve_cg + } // end tri_solve_cg #endif +#define FunctorTypeMacro(Functor, IsLower, BlockEnabled) \ + Functor + template static void lower_tri_solve(execution_space &space, TriSolveHandle &thandle, @@ -2681,13 +1084,22 @@ struct SptrsvWrap { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - auto nlevels = thandle.get_num_levels(); + const auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host // during scheduling This requires making sure the host view in the handle // is properly updated after the symbolic phase - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = thandle.get_host_nodes_per_level(); - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const auto nodes_per_level = thandle.get_nodes_per_level(); + const auto hnodes_per_level = thandle.get_host_nodes_per_level(); + const auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + const auto block_size = 
thandle.get_block_size(); + const auto block_enabled = false; // thandle.is_block_enabled(); + assert(block_size == 0); + + // Set up functor types + using LowerRPFunc = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, true, false); + using LowerTPFunc = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, true, false); #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; @@ -2753,44 +1165,29 @@ struct SptrsvWrap { #endif if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + LowerRPFunc lrpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, block_size); + Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::Experimental::require( range_policy(space, node_count, node_count + lvl_nodes), Kokkos::Experimental::WorkItemProperty::HintLightWeight), - LowerTriLvlSchedRPSolverFunctor( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + lrpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + LowerTPFunc ltpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 + ? 
team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_l_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + ltpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -2837,6 +1234,8 @@ struct SptrsvWrap { else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -2985,6 +1384,8 @@ struct SptrsvWrap { SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -3067,7 +1468,6 @@ struct SptrsvWrap { std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; #endif - } // end lower_tri_solve template ( - row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); + urpp); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { + UpperTPFunc utpp(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, node_count, block_size); int team_size = thandle.get_team_size(); - -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - false, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, Kokkos::AUTO), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - else - Kokkos::parallel_for( - "parfor_u_team", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); + auto tp = team_size == -1 + ? 
team_policy(space, lvl_nodes, Kokkos::AUTO) + : team_policy(space, lvl_nodes, team_size); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + tp, Kokkos::Experimental::WorkItemProperty::HintLightWeight), + utpp); } // TP2 algorithm has issues with some offset-ordinal combo to be // addressed @@ -3240,6 +1630,8 @@ tstf); } // end elseif else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); #ifdef profile_supernodal_etree size_t flops = 0; @@ -3493,6 +1885,9 @@ tstf); } // end elseif SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { + KK_REQUIRE_MSG(!block_enabled, + "Block matrices not yet supported for supernodal"); + #ifdef profile_supernodal_etree Kokkos::Timer timer; timer.reset(); @@ -3603,13 +1998,13 @@ tstf); } // end elseif } // end upper_tri_solve - template static void tri_solve_chain(execution_space &space, TriSolveHandle &thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType &rhs, - LHSType &lhs, const bool /*is_lowertri_*/) { + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif @@ -3625,28 +2020,14 @@ tstf); } // end elseif auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - const bool is_lowertri = thandle.is_lower_tri(); - size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines using large_cutoff_policy_type = Kokkos::TeamPolicy; - /* - using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = - LowerTriLvlSchedTP1SolverFunctor; using UTP1Functor = - UpperTriLvlSchedTP1SolverFunctor; using LSingleBlockFunctor = - LowerTriLvlSchedTP1SingleBlockFunctor; using USingleBlockFunctor = - UpperTriLvlSchedTP1SingleBlockFunctor; - */ using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; + LHSType, RHSType, IsLower>; int team_size = thandle.get_team_size(); int vector_size = @@ -3670,315 +2051,105 @@ tstf); } // end elseif // team_size_singleblock is unimportant } - // This is only necessary for Lower,UpperTri functor versions; else, - // is_lowertri can be passed as arg to the generic Tri functor... - if (is_lowertri) { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - true, node_count); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } - - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
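// A minimal sketch of the team-size query used just above: when the handle
// leaves team_size unset (-1), a throwaway one-league policy asks Kokkos for a
// recommended team size for the functor before the real policy is built.
// DummyFunctor stands in for the solver functor.
#include <Kokkos_Core.hpp>

struct DummyFunctor {
  KOKKOS_INLINE_FUNCTION
  void operator()(const Kokkos::TeamPolicy<>::member_type & /*team*/) const {}
};

inline int pick_team_size(int requested, int vector_size) {
  if (requested != -1) return requested;
  return Kokkos::TeamPolicy<>(1, 1, vector_size)
      .team_size_recommended(DummyFunctor(), Kokkos::ParallelForTag());
}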
- Kokkos::parallel_for( - "parfor_l_team_chain1", - Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - node_count += lvl_nodes; - - } else { - size_type lvl_nodes = 0; + for (size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink + 1); + + if (echain - schain == 1) { + // if team_size is -1 (unset), get recommended size from Kokkos + TriLvlSchedTP1SolverFunctor + tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, + node_count); + if (team_size == -1) { + team_size = + team_policy(space, 1, 1, vector_size) + .team_size_recommended(tstf, Kokkos::ParallelForTag()); + } - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } + size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + team_policy(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); + node_count += lvl_nodes; - if (team_size_singleblock <= 0) { - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } + } else { + size_type lvl_nodes = 0; - if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true); -#else - LowerTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); -#endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, true, 0, - cutoff); -#else - LowerTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif - Kokkos::parallel_for( - "parfor_l_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); } - // TODO: space.fence() - Kokkos::fence(); // TODO - is this necessary? that is, can the - // parallel_for launch before the s/echain values have - // been updated? 
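// A minimal sketch relating to the fence TODO above: if only the work already
// submitted to this execution-space instance must finish before the next chain
// link is launched, an instance-level fence is a narrower alternative to the
// global Kokkos::fence().  Whether any fence is needed at all depends on the
// host-side reads noted in the comment, so this is a sketch, not a fix.
#include <Kokkos_Core.hpp>

template <class ExecSpace>
void fence_chain_link(const ExecSpace &space) {
  space.fence();  // waits only on work queued to this instance
}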
- } - - } else { - for (size_type chainlink = 0; chainlink < num_chain_entries; - ++chainlink) { - size_type schain = h_chain_ptr(chainlink); - size_type echain = h_chain_ptr(chainlink + 1); - if (echain - schain == 1) { - // if team_size is -1 (unset), get recommended size from Kokkos -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - is_lowertri, node_count); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - node_count); -#endif - if (team_size == -1) { - team_size = - team_policy(space, 1, 1, vector_size) - .team_size_recommended(tstf, Kokkos::ParallelForTag()); - } + if (team_size_singleblock <= 0) { + team_size_singleblock = + team_policy(space, 1, 1, vector_size) + .team_size_recommended( + SingleBlockFunctor(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, + nodes_per_level, node_count, schain, + echain), + Kokkos::ParallelForTag()); + } - // TODO To use cudagraph here, need to know how many non-unit chains - // there are, create a graph for each and launch accordingly - size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? + if (cutoff <= team_size_singleblock) { + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain); Kokkos::parallel_for( - "parfor_u_team_chain1", + "parfor_l_team_chainmulti", Kokkos::Experimental::require( - team_policy(space, lvl_nodes, team_size, vector_size), + team_policy(space, 1, team_size_singleblock, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); - node_count += lvl_nodes; - } else { - size_type lvl_nodes = 0; - - for (size_type i = schain; i < echain; ++i) { - lvl_nodes += hnodes_per_level(i); - } - - if (team_size_singleblock <= 0) { - // team_size_singleblock = team_policy(1, 1, - // 1).team_size_recommended(SingleBlockFunctor(row_map, entries, - // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, - // node_count), Kokkos::ParallelForTag()); - team_size_singleblock = - team_policy(space, 1, 1, vector_size) - .team_size_recommended( - SingleBlockFunctor(row_map, entries, values, lhs, rhs, - nodes_grouped_by_level, - nodes_per_level, node_count, schain, - echain, is_lowertri), - Kokkos::ParallelForTag()); - } - - if (cutoff <= team_size_singleblock) { -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri); -#else - UpperTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain); -#endif - Kokkos::parallel_for( - "parfor_u_team_chainmulti", - Kokkos::Experimental::require( - team_policy(space, 1, team_size_singleblock, vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } else { - // team_size_singleblock < cutoff => kernel must allow for a - // block-stride internally -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, is_lowertri, - 0, cutoff); -#else - UpperTriLvlSchedTP1SingleBlockFunctor - tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - nodes_per_level, node_count, schain, echain, cutoff); -#endif - Kokkos::parallel_for( - 
"parfor_u_team_chainmulti_cutoff", - Kokkos::Experimental::require( - large_cutoff_policy_type(1, team_size_singleblock, - vector_size), - Kokkos::Experimental::WorkItemProperty::HintLightWeight), - tstf); - } - node_count += lvl_nodes; + // team_size_singleblock < cutoff => kernel must allow for a + // block-stride internally + SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, + nodes_grouped_by_level, nodes_per_level, + node_count, schain, echain, 0, cutoff); + Kokkos::parallel_for( + "parfor_l_team_chainmulti_cutoff", + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? } + // TODO: space.fence() + Kokkos::fence(); // TODO - is this necessary? that is, can the + // parallel_for launch before the s/echain values have + // been updated? } } // end tri_solve_chain // -------------------------------- // Stream interfaces // -------------------------------- - template - static void lower_tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { - // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment - using nodes_per_level_type = - typename TriSolveHandle::hostspace_nnz_lno_view_t; - using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector hnodes_per_level_v(nstreams); - std::vector nodes_grouped_by_level_v(nstreams); - std::vector node_count_v(nstreams); - - // Retrieve data from handles and find max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); - nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // 1. 
Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Only if stream i-th still has this level - if (lvl < nlevels_v[i]) { - size_type lvl_nodes = hnodes_per_level_v[i](lvl); - if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - range_policy(execspace_v[i], node_count_v[i], - node_count_v[i] + lvl_nodes), - LowerTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == - KokkosSparse::Experimental::SPTRSVAlgorithm:: - SEQLVLSCHD_TP1) { - int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], true, - node_count_v[i]); -#else - LowerTriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); - } - node_count_v[i] += lvl_nodes; - } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams - } // end for lvl - } // end lower_tri_solve_streams - - template - static void upper_tri_solve_streams( - const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &row_map_v, - const std::vector &entries_v, - const std::vector &values_v, - const std::vector &rhs_v, std::vector &lhs_v) { + static void tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + using RPPointFunctor = + FunctorTypeMacro(TriLvlSchedRPSolverFunctor, IsLower, false); + using TPPointFunctor = + FunctorTypeMacro(TriLvlSchedTP1SolverFunctor, IsLower, false); // Create vectors for handles' data in streams int nstreams = execspace_v.size(); @@ -4011,41 +2182,28 @@ tstf); } // end elseif "parfor_fixed_lvl", range_policy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), - UpperTriLvlSchedRPSolverFunctor( - row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i])); + RPPointFunctor(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { int team_size = thandle_v[i]->get_team_size(); -#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], false, - node_count_v[i]); -#else - UpperTriLvlSchedTP1SolverFunctor - tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], - rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); -#endif - if (team_size == -1) - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); - 
else - Kokkos::parallel_for( - "parfor_l_team", - team_policy(execspace_v[i], lvl_nodes, team_size), tstf); + auto tp = + team_size == -1 + ? team_policy(execspace_v[i], lvl_nodes, Kokkos::AUTO) + : team_policy(execspace_v[i], lvl_nodes, team_size); + TPPointFunctor tstf(row_map_v[i], entries_v[i], values_v[i], + lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i], node_count_v[i]); + Kokkos::parallel_for("parfor_l_team", tp, tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams } // end for lvl - } // end upper_tri_solve_streams + } // end tri_solve_streams }; // struct SptrsvWrap diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index d69c499c60..b2c57b1dfa 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -135,19 +135,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, true); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Sptrsv::lower_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template lower_tri_solve(space, *sptrsv_handle, row_map, + entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { @@ -156,19 +156,19 @@ struct SPTRSV_SOLVEget_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Sptrsv::tri_solve_chain(space, *sptrsv_handle, row_map, entries, values, - b, x, false); + Sptrsv::template tri_solve_chain(space, *sptrsv_handle, row_map, + entries, values, b, x); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) // TODO: set stream in thandle's sptrsvCudaGraph - Sptrsv::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, - b, x); + Sptrsv::tri_solve_cg(*sptrsv_handle, row_map, entries, values, + b, x); else #endif - Sptrsv::upper_tri_solve(space, *sptrsv_handle, row_map, entries, - values, b, x); + Sptrsv::template upper_tri_solve(space, *sptrsv_handle, row_map, + entries, values, b, x); } } Kokkos::Profiling::popRegion(); @@ -202,8 +202,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } else { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { @@ -212,8 +213,9 @@ struct SPTRSV_SOLVE(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, + b_v, x_v); } Kokkos::Profiling::popRegion(); } diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index b8b35bc422..8beff14592 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -47,15 +47,12 @@ template struct SptrsvTest { // Define useful types - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename 
RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using Crs = CrsMatrix; @@ -65,6 +62,9 @@ struct SptrsvTest { using range_policy_t = Kokkos::RangePolicy; + static inline const scalar_t ZERO = scalar_t(0); + static inline const scalar_t ONE = scalar_t(1); + static std::vector> get_5x5_ut_ones_fixture() { std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, {0.00, 1.00, 0.00, 0.00, 1.00}, @@ -103,6 +103,17 @@ struct SptrsvTest { return A; } + static bool do_cusparse() { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + return ( + std::is_same::value && + std::is_same::value && + std::is_same::value); +#else + return false; +#endif + } + struct ReductionCheck { ValuesType lhs; @@ -112,12 +123,83 @@ struct SptrsvTest { void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } }; - static void run_test_sptrsv() { - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); + static std::tuple create_crs_lhs_rhs( + const std::vector> &fixture) { + RowMapType row_map; + EntriesType entries; + ValuesType values; + + compress_matrix(row_map, entries, values, fixture); + const auto nrows = row_map.size() - 1; + const auto nnz = values.size(); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + ValuesType lhs("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + + return std::make_tuple(triMtx, lhs, rhs); + } + + template + static void basic_check(const SpMatrix &triMtx, const ValuesType &lhs, + const ValuesType &rhs, const bool is_lower, + const size_type block_size = 0) { + // FIXME Issues with some integral type combos for SEQLVLSCHED_TP2, + // currently unavailable + std::vector algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP, + SPTRSVAlgorithm::SEQLVLSCHD_TP1}; + if (block_size == 0) { + // SEQLVLSCHD_TP1CHAIN and SPTRSV_CUSPARSE are not supported for blocks + algs.push_back(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN); + if (do_cusparse()) { + algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE); + } + } + + auto row_map = triMtx.graph.row_map; + auto entries = triMtx.graph.entries; + auto values = triMtx.values; + + const size_type nrows = row_map.size() - 1; + + for (auto alg : algs) { + KernelHandle kh; + kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); + if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + } + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + 
Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + + kh.destroy_sptrsv_handle(); + } + } + + static void run_test_sptrsv() { const size_type nrows = 5; - const size_type nnz = 10; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using host_crsmat_t = @@ -142,121 +224,13 @@ struct SptrsvTest { // Upper tri { - RowMapType row_map; - EntriesType entries; - ValuesType values; - - auto fixture = get_5x5_ut_ones_fixture(); - - compress_matrix(row_map, entries, values, fixture); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - - Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); + const auto [triMtx, lhs, rhs] = + create_crs_lhs_rhs(get_5x5_ut_ones_fixture()); - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0) ); - */ - - kh.destroy_sptrsv_handle(); + basic_check(triMtx, lhs, rhs, false); } - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - ReductionCheck(lhs), sum); - EXPECT_EQ(sum, lhs.extent(0)); - - kh.destroy_sptrsv_handle(); - } - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), - 
ReductionCheck(lhs), sum);
-      EXPECT_EQ(sum, lhs.extent(0));
-
-      kh.destroy_sptrsv_handle();
-    }
-#endif
-
 #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV)
     const scalar_t FIVE = scalar_t(5);
     const size_type nnz_sp = 14;
@@ -367,120 +341,13 @@ struct SptrsvTest {
 
     // Lower tri
     {
-      auto fixture = get_5x5_lt_ones_fixture();
-      RowMapType row_map;
-      EntriesType entries;
-      ValuesType values;
-
-      compress_matrix(row_map, entries, values, fixture);
-
-      // Create known_lhs, generate rhs, then solve for lhs to compare to
-      // known_lhs
-      ValuesType known_lhs("known_lhs", nrows);
-      // Create known solution lhs set to all 1's
-      Kokkos::deep_copy(known_lhs, ONE);
-
-      // Solution to find
-      ValuesType lhs("lhs", nrows);
-
-      // A*known_lhs generates rhs: rhs is dense, use spmv
-      ValuesType rhs("rhs", nrows);
-
-      Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries);
-      KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs);
-      {
-        KernelHandle kh;
-        bool is_lower_tri = true;
-        kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows,
-                                is_lower_tri);
-
-        sptrsv_symbolic(&kh, row_map, entries);
-        Kokkos::fence();
-
-        sptrsv_solve(&kh, row_map, entries, values, rhs, lhs);
-        Kokkos::fence();
+      const auto [triMtx, lhs, rhs] =
+          create_crs_lhs_rhs(get_5x5_lt_ones_fixture());
-        scalar_t sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)),
-                                ReductionCheck(lhs), sum);
-        EXPECT_EQ(sum, lhs.extent(0));
-
-        Kokkos::deep_copy(lhs, ZERO);
-        kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP);
-        sptrsv_solve(&kh, row_map, entries, values, rhs, lhs);
-        Kokkos::fence();
-
-        sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)),
-                                ReductionCheck(lhs), sum);
-        EXPECT_EQ(sum, lhs.extent(0));
-
-        // FIXME Issues with various integral type combos - algorithm currently
-        // unavailable and commented out until fixed
-        /*
-        Kokkos::deep_copy(lhs, ZERO);
-        kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2);
-        sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-        Kokkos::fence();
-
-        sum = 0.0;
-        Kokkos::parallel_reduce( range_policy_t(0, lhs.extent(0)),
-                                 ReductionCheck(lhs), sum);
-        EXPECT_EQ( sum, lhs.extent(0) );
-        */
-
-        kh.destroy_sptrsv_handle();
+      basic_check(triMtx, lhs, rhs, true);
     }
-      {
-        Kokkos::deep_copy(lhs, ZERO);
-        KernelHandle kh;
-        bool is_lower_tri = true;
-        kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows,
-                                is_lower_tri);
-        auto chain_threshold = 1;
-        kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold);
-
-        sptrsv_symbolic(&kh, row_map, entries);
-        Kokkos::fence();
-
-        sptrsv_solve(&kh, row_map, entries, values, rhs, lhs);
-        Kokkos::fence();
-
-        scalar_t sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)),
-                                ReductionCheck(lhs), sum);
-        EXPECT_EQ(sum, lhs.extent(0));
-
-        kh.destroy_sptrsv_handle();
-      }
-
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
-      if (std::is_same<size_type, int>::value &&
-          std::is_same<lno_t, int>::value &&
-          std::is_same<typename device::execution_space, Kokkos::Cuda>::value) {
-        Kokkos::deep_copy(lhs, ZERO);
-        KernelHandle kh;
-        bool is_lower_tri = true;
-        kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows,
-                                is_lower_tri);
-
-        sptrsv_symbolic(&kh, row_map, entries, values);
-        Kokkos::fence();
-
-        sptrsv_solve(&kh, row_map, entries, values, rhs, lhs);
-        Kokkos::fence();
-
-        scalar_t sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)),
-                                ReductionCheck(lhs), sum);
-        EXPECT_EQ(sum, lhs.extent(0));
-
-        kh.destroy_sptrsv_handle();
-      }
-#endif
-
 #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV)
     {
       // L in csc
@@ -558,7 +425,6 @@ struct SptrsvTest {
       scalar_t sum = 0.0;
       Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)),
                               ReductionCheck(X), sum);
-      EXPECT_EQ(sum, lhs.extent(0));
       EXPECT_EQ(sum, X.extent(0));
 
       khL.destroy_sptrsv_handle();
@@ -619,7 +485,6 @@ struct SptrsvTest {
       scalar_t sum = 0.0;
       Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)),
                               ReductionCheck(X), sum);
-      EXPECT_EQ(sum, lhs.extent(0));
       EXPECT_EQ(sum, X.extent(0));
 
       khLd.destroy_sptrsv_handle();
@@ -629,7 +494,8 @@ struct SptrsvTest {
     }
   }
 
-  static void run_test_sptrsv_streams(int test_algo, int nstreams) {
+  static void run_test_sptrsv_streams(SPTRSVAlgorithm test_algo, int nstreams,
+                                      const bool is_lower) {
     // Workaround for OpenMP: skip tests if concurrency < nstreams because of
     // not enough resource to partition
     bool run_streams_test = true;
@@ -645,9 +511,6 @@ struct SptrsvTest {
 #endif
     if (!run_streams_test) return;
 
-    scalar_t ZERO = scalar_t(0);
-    scalar_t ONE = scalar_t(1);
-
     const size_type nrows = 5;
     const size_type nnz = 10;
 
@@ -662,150 +525,63 @@ struct SptrsvTest {
     std::vector<ValuesType> rhs_v(nstreams);
     std::vector<ValuesType> lhs_v(nstreams);
 
-    RowMapType_hostmirror hrow_map;
-    EntriesType_hostmirror hentries;
-    ValuesType_hostmirror hvalues;
-
-    // Upper tri
-    {
-      auto fixture = get_5x5_ut_ones_fixture();
-      compress_matrix(hrow_map, hentries, hvalues, fixture);
-
-      for (int i = 0; i < nstreams; i++) {
-        // Allocate U
-        row_map_v[i] = RowMapType("row_map", nrows + 1);
-        entries_v[i] = EntriesType("entries", nnz);
-        values_v[i] = ValuesType("values", nnz);
-
-        // Copy from host to device
-        Kokkos::deep_copy(row_map_v[i], hrow_map);
-        Kokkos::deep_copy(entries_v[i], hentries);
-        Kokkos::deep_copy(values_v[i], hvalues);
-
-        // Create known_lhs, generate rhs, then solve for lhs to compare to
-        // known_lhs
-        ValuesType known_lhs("known_lhs", nrows);
-        // Create known solution lhs set to all 1's
-        Kokkos::deep_copy(known_lhs, ONE);
-
-        // Solution to find
-        lhs_v[i] = ValuesType("lhs", nrows);
-
-        // A*known_lhs generates rhs: rhs is dense, use spmv
-        rhs_v[i] = ValuesType("rhs", nrows);
-
-        Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i],
-                   entries_v[i]);
-
-        KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]);
-        Kokkos::fence();
-
-        // Create handle
-        kh_v[i] = KernelHandle();
-        bool is_lower_tri = false;
-        if (test_algo == 0)
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows,
-                                       is_lower_tri);
-        else if (test_algo == 1)
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows,
-                                       is_lower_tri);
-        else
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows,
                                       is_lower_tri);
-
-        kh_ptr_v[i] = &kh_v[i];
-
-        // Symbolic phase
-        sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]);
-        Kokkos::fence();
-      }  // Done handle creation and sptrsv_symbolic on all streams
-
-      // Solve phase
-      sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v,
-                           rhs_v, lhs_v);
-
-      for (int i = 0; i < nstreams; i++) instances[i].fence();
-
-      // Checking
-      for (int i = 0; i < nstreams; i++) {
-        scalar_t sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)),
-                                ReductionCheck(lhs_v[i]), sum);
-        EXPECT_EQ(sum, lhs_v[i].extent(0));
-
-        kh_v[i].destroy_sptrsv_handle();
-      }
-    }
-
-    // Lower tri
-    {
-      auto fixture = get_5x5_lt_ones_fixture();
-      compress_matrix(hrow_map, hentries, hvalues, fixture);
+    auto fixture =
+        is_lower ? get_5x5_lt_ones_fixture() : get_5x5_ut_ones_fixture();
+    const auto [triMtx, lhs, rhs] = create_crs_lhs_rhs(fixture);
-      for (int i = 0; i < nstreams; i++) {
-        // Allocate L
-        row_map_v[i] = RowMapType("row_map", nrows + 1);
-        entries_v[i] = EntriesType("entries", nnz);
-        values_v[i] = ValuesType("values", nnz);
+    auto row_map = triMtx.graph.row_map;
+    auto entries = triMtx.graph.entries;
+    auto values = triMtx.values;
-        // Copy from host to device
-        Kokkos::deep_copy(row_map_v[i], hrow_map);
-        Kokkos::deep_copy(entries_v[i], hentries);
-        Kokkos::deep_copy(values_v[i], hvalues);
+    for (int i = 0; i < nstreams; i++) {
+      // Allocate
+      row_map_v[i] = RowMapType("row_map", nrows + 1);
+      entries_v[i] = EntriesType("entries", nnz);
+      values_v[i] = ValuesType("values", nnz);
-        // Create known_lhs, generate rhs, then solve for lhs to compare to
-        // known_lhs
-        ValuesType known_lhs("known_lhs", nrows);
-        // Create known solution lhs set to all 1's
-        Kokkos::deep_copy(known_lhs, ONE);
+      // Copy
+      Kokkos::deep_copy(row_map_v[i], row_map);
+      Kokkos::deep_copy(entries_v[i], entries);
+      Kokkos::deep_copy(values_v[i], values);
-        // Solution to find
-        lhs_v[i] = ValuesType("lhs", nrows);
-
-        // A*known_lhs generates rhs: rhs is dense, use spmv
-        rhs_v[i] = ValuesType("rhs", nrows);
-
-        Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i],
-                   entries_v[i]);
-
-        KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]);
-        Kokkos::fence();
-
-        // Create handle
-        kh_v[i] = KernelHandle();
-        bool is_lower_tri = true;
-        if (test_algo == 0)
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows,
-                                       is_lower_tri);
-        else if (test_algo == 1)
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows,
-                                       is_lower_tri);
-        else
-          kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows,
-                                       is_lower_tri);
-
-        kh_ptr_v[i] = &kh_v[i];
-
-        // Symbolic phase
-        sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]);
-        Kokkos::fence();
-      }  // Done handle creation and sptrsv_symbolic on all streams
-
-      // Solve phase
-      sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v,
-                           rhs_v, lhs_v);
-
-      for (int i = 0; i < nstreams; i++) instances[i].fence();
+      // Create known_lhs, generate rhs, then solve for lhs to compare to
+      // known_lhs
+      ValuesType known_lhs("known_lhs", nrows);
+      // Create known solution lhs set to all 1's
+      Kokkos::deep_copy(known_lhs, ONE);
-      // Checking
-      for (int i = 0; i < nstreams; i++) {
-        scalar_t sum = 0.0;
-        Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)),
-                                ReductionCheck(lhs_v[i]), sum);
-        EXPECT_EQ(sum, lhs_v[i].extent(0));
+      // Solution to find
+      lhs_v[i] = ValuesType("lhs", nrows);
-        kh_v[i].destroy_sptrsv_handle();
-      }
+      // A*known_lhs generates rhs: rhs is dense, use spmv
+      rhs_v[i] = ValuesType("rhs", nrows);
+
+      KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]);
+      Kokkos::fence();
+
+      // Create handle
+      kh_v[i] = KernelHandle();
+      kh_v[i].create_sptrsv_handle(test_algo, nrows, is_lower);
+      kh_ptr_v[i] = &kh_v[i];
+
+      // Symbolic phase
+      sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]);
+      Kokkos::fence();
+    }  // Done handle creation and sptrsv_symbolic on all streams
+
+    // Solve phase
+    sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v,
+                         rhs_v, lhs_v);
+
+    for (int i = 0; i < nstreams; i++) instances[i].fence();
+
+    // Checking
+    for (int i = 0; i < nstreams; i++) {
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)),
+                              ReductionCheck(lhs_v[i]), sum);
+      EXPECT_EQ(sum, lhs_v[i].extent(0));
+      kh_v[i].destroy_sptrsv_handle();
     }
   }
 };
@@ -823,25 +599,18 @@ template <typename scalar_t, typename lno_t, typename size_type, typename device>
 void test_sptrsv_streams() {
   using TestStruct = Test::SptrsvTest<scalar_t, lno_t, size_type, device>;
+  std::vector<SPTRSVAlgorithm> algs = {SPTRSVAlgorithm::SEQLVLSCHD_RP,
+                                       SPTRSVAlgorithm::SEQLVLSCHD_TP1};
+  if (TestStruct::do_cusparse()) {
+    algs.push_back(SPTRSVAlgorithm::SPTRSV_CUSPARSE);
+  }
 
-  TestStruct::run_test_sptrsv_streams(0, 1);
-  TestStruct::run_test_sptrsv_streams(0, 2);
-  TestStruct::run_test_sptrsv_streams(0, 3);
-  TestStruct::run_test_sptrsv_streams(0, 4);
-  TestStruct::run_test_sptrsv_streams(1, 1);
-  TestStruct::run_test_sptrsv_streams(1, 2);
-  TestStruct::run_test_sptrsv_streams(1, 3);
-  TestStruct::run_test_sptrsv_streams(1, 4);
-
-#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
-  if (std::is_same<lno_t, int>::value &&
-      std::is_same<typename device::execution_space, Kokkos::Cuda>::value) {
-    TestStruct::run_test_sptrsv_streams(2, 1);
-    TestStruct::run_test_sptrsv_streams(2, 2);
-    TestStruct::run_test_sptrsv_streams(2, 3);
-    TestStruct::run_test_sptrsv_streams(2, 4);
+  for (auto alg : algs) {
+    for (int nstreams = 1; nstreams <= 4; ++nstreams) {
+      TestStruct::run_test_sptrsv_streams(alg, nstreams, true);
+      TestStruct::run_test_sptrsv_streams(alg, nstreams, false);
+    }
   }
-#endif
 }
 
 #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \