From a7117a3825a00f7cdcf5baf794f7460b21c96403 Mon Sep 17 00:00:00 2001 From: vanathig Date: Mon, 22 May 2023 15:35:03 +0530 Subject: [PATCH] Fix date type handling when parsing excel files --- .../io/cdap/directives/parser/ParseExcel.java | 9 ++++---- .../directives/parser/ParseExcelTest.java | 21 +++++++++++++++++- .../resources/date-formats-test-sheet.xlsx | Bin 0 -> 8961 bytes 3 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 wrangler-core/src/test/resources/date-formats-test-sheet.xlsx diff --git a/wrangler-core/src/main/java/io/cdap/directives/parser/ParseExcel.java b/wrangler-core/src/main/java/io/cdap/directives/parser/ParseExcel.java index a253382ac..f3df25078 100644 --- a/wrangler-core/src/main/java/io/cdap/directives/parser/ParseExcel.java +++ b/wrangler-core/src/main/java/io/cdap/directives/parser/ParseExcel.java @@ -42,6 +42,7 @@ import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.CellType; +import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.slf4j.Logger; @@ -101,6 +102,7 @@ public List execute(List records, final ExecutorContext context) throws DirectiveExecutionException, ErrorRowException { List results = new ArrayList<>(); ByteArrayInputStream input = null; + DataFormatter formatter = new DataFormatter(); try { for (Row record : records) { int idx = record.find(column); @@ -159,25 +161,22 @@ public List execute(List records, final ExecutorContext context) String value = ""; switch (cell.getCellTypeEnum()) { case STRING: - newRow.add(name, cell.getStringCellValue()); value = cell.getStringCellValue(); break; case NUMERIC: if (HSSFDateUtil.isCellDateFormatted(cell)) { - newRow.add(name, cell.getDateCellValue()); - value = cell.getDateCellValue().toString(); + value = formatter.formatCellValue(cell); } else { - newRow.add(name, cell.getNumericCellValue()); value = String.valueOf(cell.getNumericCellValue()); } break; case BOOLEAN: - newRow.add(name, cell.getBooleanCellValue()); value = String.valueOf(cell.getBooleanCellValue()); break; } + newRow.add(name, value); if (rows == 0 && firstRowAsHeader) { columnNames.put(cell.getAddress().getColumn(), value); diff --git a/wrangler-core/src/test/java/io/cdap/directives/parser/ParseExcelTest.java b/wrangler-core/src/test/java/io/cdap/directives/parser/ParseExcelTest.java index 653d358f7..addf23eba 100644 --- a/wrangler-core/src/test/java/io/cdap/directives/parser/ParseExcelTest.java +++ b/wrangler-core/src/test/java/io/cdap/directives/parser/ParseExcelTest.java @@ -67,5 +67,24 @@ public void testNoSheetName() throws Exception { Assert.assertEquals(1, pipeline.getSecond().size()); } } -} + @Test + public void testDateFormatting() throws Exception { + try (InputStream stream = + ParseAvroFileTest.class.getClassLoader().getResourceAsStream("date-formats-test-sheet.xlsx")) { + byte[] data = IOUtils.toByteArray(stream); + + String[] directives = new String[]{ + "parse-as-excel :body '0'", + }; + + List rows = new ArrayList<>(); + rows.add(new Row("body", data)); + List results = TestingRig.execute(directives, rows); + + for (Row result : results) { + Assert.assertEquals(result.getValue("A"), result.getValue("B")); + } + } + } +} diff --git a/wrangler-core/src/test/resources/date-formats-test-sheet.xlsx b/wrangler-core/src/test/resources/date-formats-test-sheet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..41b826b69a601ca8617f77d0c7a21243b9859baf GIT binary patch literal 8961 zcmeHtgD^^cCTk?EZX!=|LH01vOwg2K3s7O*&>S4zSy_8v@IKiW_2a>kYytz{XnuWz#0Xhg`H3SNp~8OXW(*j6hu=}HRVs&dzPlS02D8|j9e-Q5lE*fxE&z_4MK zHI%yx$_Q$EiehhiKpIq#<&CNu^E6i#S$=Wzji`eRr4G*G-B-gSO(fjlXAP9J$@0Yk z&0873)yj z^7yps3u#juLhx^CNula(4AChPb?km`p2$hJT>!3VK6K0X6Yz2OSB922lvii-30j zAH&P5LNU986c-!Z6;W6?f>ezjm0_tjPHymww9YBgP8DBzG2Lg+XD`xU$a&Gbx5qJ- zH<#qe53N&4PajLxVvMtD6Cq;1Ckew7P7l)UR|KvZUsXZPimM+~hSfCj=I$nrX9mot z6n#S#jo??9`6t?Hn@MG44gI9-yuT;nZ^jGc+g=bqkBimMT_7G+EPn84XPp zBs9-Ej!-CoDOTd7(8RZ&#)U7toxEnZ(l6ws`aC(ruSO+_Sd4ROjD^M5wc`&dDiaE2 zWcrZ>#2xCzwKuB`Sex|Pgk0|Pp`NS8Oh^xNvPUznO=M|Uv=ze1sIfXz8ZpdCYa+=d zSWA?Sa`kX3&JQ9osf1%hP!I?y0u?NTs$6sj(Ih9LWpWMdbb#lUUP7>}bQ|wk&(pSP zo(p}59%{nFvGO8eo`b{R;)iJ>-nWbiTn>fLWY=k@!N^V<1~Pm0se3({kbZH~8(>RB zKo*+IAJfH7y^L{|TuO2XM7&I_SP@R~0rY$gC@`Y0Y}A5ASfiWF66QsYVN*|;46ky1 zrk27>t(`vnHYHx}6E>|Cr#ODi)=Te&nt0k>&I;fYwGlT3WWr59{&gglJcd#(P_X#n zmoFoZuhyND%&CG!7Mj#7zkf`771HH%g7md($W%KUx$`n-&4RD4+i`cNh%Bt-ptbR- z?wZEs3i5oTZvc_NB>ZdXX0cIO7GA+roUaIroez@r)QW6f<|-ey3?EikUbSRA5O^FHQz4FgM5)5lOW8F{Q6P$!$Q&kA zR;qbcYvm;u!gJD+a8NND4AnkyEPlGu! zAPQ{BnanAYoP+kJpHXc^n;|%Gzs$61s&%<5frs_@(1_GJ{h-zs{xoMYXn3;cdEd7& z=LV7zA7KMwVQGnEx)Dbn!YWZvYvvEVv4At$=#D2dfmsq(XX(5*>vLE55QYAy$_I%g zrJh1YS|Dmq1VDg-sQiCi!(SEuPd5Pt3ELpV`R{HOYKro`>=^CH_u=dwneMoli>_>x z``2t)M@%d`|+{%7mNj4iKp=H%I69KyVg#s=IkpP_BQV4rs}mAt`%^~Sb65`=RY zI~{{TFg~b!8vFtd8*6`eulxWREyD#HB$6=5oP)#n^@^T`Fub6MQt}Km6~=A-iQSFU z+GT=t7-A+z>bC-KUaRj{2G=(=k!LWNgC9T=%NI}H1$&WAo)RL;3ZZ%dCU<5in6A2{ zE)Y)IewZtIeX{8)xIqDvl*>OYdnW7~=1Lwu_<}5eA;ATlpWdo@^u<;Dc9@}K`Gk4o z?8tm;Xkd07veW-mnISBU5Na3zz!nnm6F~0x!%DhXnwhz}u>HDl{4|%pz`wez#7X;Q za-6Uux&u;-OOAfU{FyiEik122_DgdI8J{-^;G*IAWSC&z#^=G(8#KqRFjd$00UJ zB~mH;=;Ax>Oe2~lF?h;aFN%eu?4h_Osn;n&HaU;glIOiCj7WuMb)KZVh|Z}`+Lj+| z0f92bn_AJV-YDvqqgS>$Mhi#uP9p_aw}}a-ua&1ueAcu3Q0^hy{_yZ?PhAs%Xd&z< zlGSS0(9gx8HJl1r@x>5I4ea0|^gEoep{laj4Ex)h@LI1qVn^ByypM|Jx49`a95dGM zT+kdWP}fkY^<|`Xx-WMdo(;DcpH~qi1jP8`K1J^S!NlY>#F9);-yKI`E3aWpS(+El zNt})p8$XLT1YJ+?oH2O#q+mRy(A1yhME3o`s2%Vm?#?FNK~kg?!Bz4kUh>_!!H6%@ ziL6w~haWp!ALFlM2I+9SOs^*`e%#H~p4v%4ZsZw>6WQc-e#nLlTs(d}+l-@G`#f%&xJqrB zokY^mxUZ_k#hK7Xj15i0f38Mf_&!zKK{#pZISgw1SsWsD{Wr2ny0HbSN-%Y%t(>kq zYSfGS_Tu8p9Dnaq*W;=7H18K_$lVDk1|L;UgtMX-(x|6X0&Mu`xqX#olAv-v>QSgSb({fw9QK)ljUD##Op%{i*N2U=L=e=jUJHcrh11rKtdX_mWIdl zUagXVD&a~$*jUBKkCTFGA2nGnz^AlL`3pz9IBGQgx^`JDO zQ4WXT(+!`gOL{CUf6Hr{yKVd?HoQqKT1oH|poD$y11Ctq&{t>D<>XtJ^%OZmh_26j z1DF!Mf%CmP-HyhoVkk@Q;N+sl1%bU}`?p4@DmSxvt=dPLBY0~Q2CqJVNXz0v=W{gz zYQo4+T0To5SBo)-==FqWn-^Q`&FA76yzH5mKhRfcIWj${ZIn?E+U+rvXm&5VcyGdU%pLcV z!;wK~{g>ps-EK7p-OD68{cvqU9X5+CWXB-t*z{`YTDTRHXmEw@O5S=Y=^p6lx;R z4l$}jBBp61oDFHzSnDN>BSyN-$X_p=jIG&K_mtSo3j3N;89|JS*9+PzHfl-lHw>pYCd{wpzY2t(6Nz;op=_evK7G~8lOXTBGX;tw z;EES^Ux-l=a#>PH80__@MWc|4_Xv^2_l;W=loXaAQK{otX+mu5zB8w6hTF2vNTQ3l zG5bVX+ZLVPPKNIqToYxsB-FUjFc!%rC~#Vuywpmd?i$@s;L{}HqBCK{HL?ww)TT5{ zCf%2-U>f|Sd7fnmWtLE8GsobZioj%cd7NZdW8vTw?1@JoiMd{QO(eN0_lggkrQw|& zi>CPb88nz|D{7iy9&U#{p|wMHF9rc6lRAGUuh~qCb1}a9d_RtuIn-_Q@vwRJZ^9 z;#|x+z0EghnD*#oCr7!h7R*JFbT|Un*DCJe2irCo4bNV(tlB~cvWEIZd6n@jjd>-B zie=M@R+Nr&DQ*HqSHf(yGwY4QxrQefZLUdSxEmAhyNSwMrBlzim}w;oP(@2dDKcEb zwI_pKj6|wj(T=Y0(OTuqp8t-92Qc*s{o*u%k%Bp~svCi-0h&0DzQt?(#dMuXSDi_f z-PAl?X(uoeyB#jBC-~yCR696aY)LH4oFr-61h=?8y)C*5=~KG2cOqA5I+Thr8y9(; ztNLP#lb5xdRav`KAq!Qs>xUTjDfVFeFU*2tDYwA>mh|u$PGyiWh}fBHHwGQkMv9WI zIf#~4eWF*;{<^^$p^1@-+vXa3hoEQP^9hJ*<|TfxevhfxY4eqFd^l7L>K z((n_G_%xrSUKRtEfMiteNs@d!Chb5B5#7y<=S*aoP8hC&gPxpSR90cD1gbeoWI
    k|1POUN2JjWD?Wl-5q*Bq} zOc4(z@fb#}Bm!d)*4y~$SRVWtY~fes7Fd#qj5;thLe%mw8_Pr83A9G;YV>y97|}l) z#;fX~P4C@6=^0H(xm#5a%(EVDf?38wF{7ds3`X%AzT9kl)LZm*I>IE+zHJ!J1=_h= zhnhDC)5Eqa6-?~U354obzC1$pUagrV5&GUvg^IQ`Yp=4$i`3xU^b*0E#>~F}SOT9c zCs@)%e3yGi&SYW;X7Drp2)MLvZG8CGY|UJLMzi(W z>v0=A82;5W4_$8#YrAA+0#!cirbr2V^De7@S*iwtPvoo6e0UxUjMutb7j%by-5Pdq z06uxXAGw`5o_?Haba6q?=UvodD)*_~5yLZLSOKhX_;t-8&L&_ZZwezFyNWv72bZP2 zDe9;rGr4Hv>`SCt2tYi=teT1x{C#7Xsoh{E#*( z-t}QJPnT(g5&~GYq`Ek7_*Zd?(@gQHpZk-lrF|xES-Ms2z+&jn5aG%cao|XVuf;2p zOc#)W`G)d|M?=&*HN*|ar$lKad-#jg-ElwqCb^90SAINb?b$d{uPa!0aGERsMy7oM zYtz_s+b6smF@hva&CIhxM}pyD5uAd+_?@#2rZ4_p&*gRyeneZvStBF|2 zbk9>A+tRvnT7V05k6m{?%(;Sn@hgx5_9u#rIat{=h+IbABp88pNiT{9fEJ3rf=b0X3AjK2D*2%(^e8W>dK@be1V> z3kl~u&wDqTCaw2*_A&SZ_*n(;wgK8}G2Lywd;4*N{^RB7@U`!~4?f~6I{60gZWViI z6f3nC2z=Wmoh{WpDpA_Jk*;j0_Rye!~{$swNe+rH|nX`!!h$T-x<<{_7xIV-sTG;`nX)++co|!Jy26(;koq z;e)t~JI32X^V#ZsAR#=*CF`cQrnSjLftNwI2g>12?&8Yrtv|n#_T}E-JJ^v(GIXnr z;vr`+-SMh(gKn`H&dS}iHE&1$qU$P$j0=IXJQ_=>JZBkbxOJ`iP}8xq0o9-+8eeKOO1k^X_Y|& zW+ir^)Lm!&wEJ3mwBNzi0~-ea8aAVk`RV^Wsy0Jdjq?X@EsdPbOx0bTt?Vs+(brd{ zORkR{qYe39V(@yA`l%%Km>|N_wlcDs`aPT6#Ti@1i=$N}+%rvz)Y1L@(F#%jLe+MY zFKMN-RkgYlZlE|<>+_M*p4mz*y&!J_E~9dMQ$Lhc{2UPDBvzAmiCYU-SSu8^Vdk?|v0eYI1G#5x*SukNJRQ-VBI;Vj zPJhXZT*Be}3rh=05S8wH_GVT5iN*BZTD8~Z@(;vaJ#QVfVLJx2TaJ5F5N^#OdPC9` z@Y5i;=iih4DFXBaZy<|-2Qg-tkfe^OgNcf>gQE+ZiG#D*A0@c|<#HgJ60N4Fya~zW zP#pwB_$p`9`Z8+~7s5F~BSunc`JBdvZ_w1T_{i;a3`9=XzjFHgmG@|Ddi0dJN@R)F z=LK2)GkKnOxXcnTT2>$Q>xyUX?i0;|3RUZBpQPGG1-+UTRl@rWs}qO5CU92BVLCYk z3uQl~ADQ)8sFSIYb2PtO@}%!ZOYOD)SoUjPkhDHtAHPF$`I1-I3T@-U8#A1?Q| z_lHh8&*qE9k>1WC!v%9c(_y>1Cp0iRy(EKve*#Mh^^^wqE&{PPZQXGhSS@ZEtFo_3 z&}sAH_*89Om7!He(wPYdovk~J&`(m#IlWGF*c#f6U*4DsfZlx=SGUqPqk zUMb1*q>iJu;-W>kh?@FBGym>i8BLb?D(VO_@>7X1{;I%6j*kD+U&zAz(X!tDF4@pu zJ|mRD&X`izl-Hx>i(W05IiS=Plkr%(xPCrYUP|z$86aVio8~nK_e_sFP)q}?Z{$3UR6ZrF*2Kmds)JPu# zA6JTgL)#&>qepe4$Kb!`+fEvVK;Qvns{;{0LiP+zgHj)3|L;NEx`&i23q|%&X4x4`qXv6y@;CB%H82Z?A|Ax8}{RjHkk3W|1cUSit4*-M`0|5VU me2?LOH#C2ROF#V!{13CEA`b`IT>t