From f6c2ca0d5831be8bda424ac3da1a4ddd5a3fe941 Mon Sep 17 00:00:00 2001 From: Mandar Marathe Date: Thu, 1 Feb 2024 12:16:30 +0530 Subject: [PATCH 1/7] REST-159 : Introduced Excel reader --- build.sbt | 22 ++ excel/README.md | 296 ++++++++++++++++++ .../scalaxy/reader/excel/ExcelFormat.scala | 19 ++ .../reader/excel/ExcelToDataFrameReader.scala | 74 +++++ excel/src/test/resources/sample_data.xlsx | Bin 0 -> 13645 bytes .../excel/ExcelToDataFrameReaderSpec.scala | 27 ++ 6 files changed, 438 insertions(+) create mode 100644 excel/README.md create mode 100644 excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala create mode 100644 excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala create mode 100644 excel/src/test/resources/sample_data.xlsx create mode 100644 excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala diff --git a/build.sbt b/build.sbt index c8337de..62e0c6c 100644 --- a/build.sbt +++ b/build.sbt @@ -80,6 +80,14 @@ val zioConfigDependencies = Seq( "dev.zio" %% "zio-config-magnolia" % zioConfigVersion ).map(_ excludeAll ("org.scala-lang.modules", "scala-collection-compat")) +val crealyticsDependencies = Seq( + "com.crealytics" %% "spark-excel" % "3.4.1_0.19.0" +).map(_.cross(CrossVersion.for3Use2_13)) + +val poiDependencies = Seq( + "org.apache.poi" % "poi" % "5.2.5" +) + // ----- MODULE DEPENDENCIES ----- // val textDependencies = @@ -89,6 +97,13 @@ val textDependencies = sparkXMLDependencies ++ zioConfigDependencies +val excelDependencies = + dataScalaxyTestUtilDependencies ++ + crealyticsDependencies ++ + poiDependencies ++ + sparkDependencies ++ + zioConfigDependencies + // ----- PROJECTS ----- // lazy val `data-scalaxy-reader` = (project in file(".")) @@ -97,9 +112,16 @@ lazy val `data-scalaxy-reader` = (project in file(".")) publishLocal / skip := true ) .aggregate(`reader-text`) + .aggregate(`reader-excel`) lazy val `reader-text` = (project in file("text")) .settings( version := "2.0.0", libraryDependencies ++= textDependencies ) + +lazy val `reader-excel` = (project in file("excel")) + .settings( + version := "1.0.0", + libraryDependencies ++= excelDependencies + ) \ No newline at end of file diff --git a/excel/README.md b/excel/README.md new file mode 100644 index 0000000..8c3a41d --- /dev/null +++ b/excel/README.md @@ -0,0 +1,296 @@ +# text + +User need to add below dependency to the `build.sbt` file: + +```Scala +ThisBuild / resolvers += "Github Repo" at "https://maven.pkg.github.com/teamclairvoyant/data-scalaxy-reader/" + +ThisBuild / credentials += Credentials( + "GitHub Package Registry", + "maven.pkg.github.com", + System.getenv("GITHUB_USERNAME"), + System.getenv("GITHUB_TOKEN") +) + +ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-text" % "2.0.0" +``` + +Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables. + +`GITHUB_TOKEN` is the Personal Access Token with the permission to read packages. 
+ +## API + +The library provides below `read` APIs in type class `TextToDataFrameReader` in order to parse a text into spark +dataframe: + +```scala +def read( + text: String, + textFormat: T, + originalSchema: Option[StructType] = None, + adaptSchemaColumns: StructType => StructType = identity +)(using sparkSession: SparkSession): DataFrame + +def read( + text: Seq[String], + textFormat: T, + originalSchema: Option[StructType], + adaptSchemaColumns: StructType => StructType +)(using sparkSession: SparkSession): DataFrame +``` + +The `read` method takes below arguments: + +| Argument Name | Default Value | Description | +|:-------------------|:-------------:|:-------------------------------------------------------------| +| text | - | The text in string format to be parsed to dataframe. | +| textFormat | - | The `TextFormat` representation for the format of the text. | +| originalSchema | None | The schema for the dataframe. | +| adaptSchemaColumns | identity | The function to modify the inferred schema of the dataframe. | + +User can use this library to read text data of various formats and parse it to spark dataframe. +Supported text formats are: + +* CSV +* JSON +* XML + +### CSV + +Suppose user wants to read CSV text data `csvText` and parse it to spark dataframe. +Then user need to perform below steps: + +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` + +#### 2. Import type class instance + +```scala + +``` + +#### 3. Define text format + +```scala +import com.clairvoyant.data.scalaxy.reader.text.formats.CSVTextFormat + +val csvTextFormat = CSVTextFormat( + header = false +) +``` + +User can provide below options to the `CSVTextFormat` instance: + +| Parameter Name | Default Value | Description | +|:------------------------------|:---------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| charToEscapeQuoteEscaping | \ | Sets a single character used for escaping the escape for the quote character. | +| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode.
This overrides `spark.sql.columnNameOfCorruptRecord`. | +| comment | # | Sets a single character used for skipping lines beginning with this character. | +| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | +| emptyValue | "" (empty string) | Sets the string representation of an empty value. | +| enableDateTimeParsingFallback | true | Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps
if values do not match the set patterns. | +| encoding | UTF-8 | Decodes the CSV files by the given encoding type. | +| enforceSchema | true | If it is set to true, the specified or inferred schema will be forcibly applied to datasource files, and headers in CSV files will be ignored.
If the option is set to false, the schema will be validated against all headers in CSV files in the case when the header option is set to true.
Field names in the schema and column names in CSV headers are checked by their positions taking into account spark.sql.caseSensitive.
Though the default value is true, it is recommended to disable the enforceSchema option to avoid incorrect results. | +| escape | \ | Sets a single character used for escaping quotes inside an already quoted value. | +| header | true | Boolean flag to tell whether csv text contains header names or not. | +| inferSchema | true | Infers the input schema automatically from data. | +| ignoreLeadingWhiteSpace | false | A flag indicating whether or not leading whitespaces from values being read should be skipped. | +| ignoreTrailingWhiteSpace | false | A flag indicating whether or not trailing whitespaces from values being read should be skipped. | +| lineSep | \n | Defines the line separator that should be used for parsing. Maximum length is 1 character. | +| locale | en-US | Sets a locale as language tag in IETF BCP 47 format. For instance, this is used while parsing dates and timestamps. | +| maxCharsPerColumn | -1 | Defines the maximum number of characters allowed for any given value being read. | | +| maxColumns | 20480 | Defines a hard limit of how many columns a record can have. | | +| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | +| multiLine | false | Parse one record, which may span multiple lines, per file. | +| nanValue | NaN | Sets the string representation of a non-number value. | +| negativeInf | -Inf | Sets the string representation of a negative infinity value. | +| nullValue | null | Sets the string representation of a null value. | +| positiveInf | Inf | Sets the string representation of a positive infinity value. | +| preferDate | true | During schema inference (inferSchema), attempts to infer string columns that contain dates as Date if the values satisfy the dateFormat option or default date format.
For columns that contain a mixture of dates and timestamps, try inferring them as TimestampType if timestamp format not specified, otherwise infer them as StringType. | +| quote | " | Sets a single character used for escaping quoted values where the separator can be part of the value.
For reading, if you would like to turn off quotations, you need to set not null but an empty string. | +| recordSep | \n | Delimiter by which rows are separated in a csv text. | +| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | +| sep | , | Delimiter by which fields in a row are separated in a csv text. | +| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | +| timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | +| unescapedQuoteHandling | STOP_AT_DELIMITER | Defines how the CsvParser will handle values with unescaped quotes.
Allowed values are STOP_AT_CLOSING_QUOTE, BACK_TO_DELIMITER, STOP_AT_DELIMITER, SKIP_VALUE, RAISE_ERROR | + +#### 4. Call API + +```scala +TextToDataFrameReader[CSVTextFormat] + .read( + text = csvText, + textFormat = csvTextFormat + ) +``` + +### JSON + +Suppose user wants to read JSON text data `jsonText` and parse it to spark dataframe. +Then user need to perform below steps: + +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` + +#### 2. Import type class instance + +```scala + +``` + +#### 3. Define text format + +```scala +import com.clairvoyant.data.scalaxy.reader.text.formats.JSONTextFormat + +val jsonTextFormat = JSONTextFormat( + dropFieldIfAllNull = true +) +``` + +User can provide below options to the `JSONTextFormat` instance: + +| Parameter Name | Default Value | Description | +|:-----------------------------------|:---------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------| +| allowBackslashEscapingAnyCharacter | false | Allows accepting quoting of all character using backslash quoting mechanism. | +| allowComments | false | Ignores Java/C++ style comment in JSON records. | +| allowNonNumericNumbers | true | Allows JSON parser to recognize set of “Not-a-Number” (NaN) tokens as legal floating number values. | +| allowNumericLeadingZeros | false | Allows leading zeros in numbers (e.g. 00012). | +| allowSingleQuotes | true | Allows single quotes in addition to double quotes. | +| allowUnquotedControlChars | false | Allows JSON Strings to contain unquoted control characters
(ASCII characters with value less than 32, including tab and line feed characters) or not. | +| allowUnquotedFieldNames | false | Allows unquoted JSON field names. | +| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides spark.sql.columnNameOfCorruptRecord. | +| dataColumnName | None | The name of column that actually contains dataset. If present, the api will only parse dataset of this column to dataframe. | +| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | +| dropFieldIfAllNull | false | Whether to ignore column of all null values or empty array during schema inference. | +| enableDateTimeParsingFallback | true | Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps
if values do not match the set patterns. | +| encoding | UTF-8 | Decodes the CSV files by the given encoding type. | +| inferSchema | true | Infers the input schema automatically from data. | +| lineSep | \n | Defines the line separator that should be used for parsing. Maximum length is 1 character. | +| locale | en-US | Sets a locale as language tag in IETF BCP 47 format. For instance, this is used while parsing dates and timestamps. | +| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | +| multiLine | false | Parse one record, which may span multiple lines, per file. | +| prefersDecimal | false | Infers all floating-point values as a decimal type. If the values do not fit in decimal, then it infers them as doubles. | +| primitivesAsString | false | Infers all primitive values as a string type. | +| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | +| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | +| timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | +| timeZone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | + +#### 4. Call API + +```scala +TextToDataFrameReader[JSONTextFormat] + .read( + text = jsonText, + textFormat = jsonTextFormat + ) +``` + +### XML + +Suppose user wants to read XML text data `xmlText` and parse it to spark dataframe. +Then user need to perform below steps: + +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` + +#### 2. Import type class instance + +```scala + +``` + +#### 3. Define text format + +```scala +import com.clairvoyant.data.scalaxy.reader.text.formats.XMLTextFormat + +val xmlTextFormat = XMLTextFormat( + rowTag = "ROW" +) +``` + +User can provide below options to the `XMLTextFormat` instance: + +| Parameter Name | Default Value | Description | +|:--------------------------|:-------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| attributePrefix | _ | The prefix for attributes so that we can differentiate attributes and elements. | +| charset | UTF-8 | Defaults to 'UTF-8' but can be set to other valid charset names. | +| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides spark.sql.columnNameOfCorruptRecord. | +| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | +| excludeAttribute | false | Whether you want to exclude attributes in elements or not. | +| ignoreSurroundingSpaces | false | Defines whether or not surrounding whitespaces from values being read should be skipped. | +| ignoreNamespace | false | If true, namespaces prefixes on XML elements and attributes are ignored.
Tags and would, for example, be treated as if both are just .
Note that, at the moment, namespaces cannot be ignored on the rowTag element, only its children.
Note that XML parsing is in general not namespace-aware even if false. | +| inferSchema | true | Infers the input schema automatically from data. | +| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | +| nullValue | null | The value to read as null value | +| rowTag | row | The row tag of your xml files to treat as a row. For example, in this xml ..., the appropriate value would be book. | +| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | +| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | +| valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | +| wildcardColName | xs_any | Name of a column existing in the provided schema which is interpreted as a 'wildcard'. It must have type string or array of strings.
It will match any XML child element that is not otherwise matched by the schema. The XML of the child becomes the string value of the column.
If an array, then all unmatched elements will be returned as an array of strings. As its name implies, it is meant to emulate XSD's xs:any type. | + +#### 4. Call API + +```scala +TextToDataFrameReader[XMLTextFormat] + .read( + text = xmlText, + textFormat = xmlTextFormat + ) +``` + +### HTML Table + +Suppose user wants to read a text `htmlText` containing data in the form of html table and parse it to spark dataframe. +Then user need to perform below steps: + +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` + +#### 2. Import type class instance + +```scala + +``` + +#### 3. Define text format + +```scala +import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat + +val htmlTableTextFormat = HTMLTableTextFormat( + tableName = "my_table" +) +``` + +User can provide below options to the `HTMLTableTextFormat` instance: + +| Parameter Name | Default Value | Mandatory | Description | +|:---------------|:-------------:|:---------:|:------------------------------------------------------------------------------| +| tableName | None | No | The name of the table in the `table` tag that you want to read the data from. | + +#### 4. Call API + +```scala +TextToDataFrameReader[HTMLTableTextFormat] + .read( + text = htmlText, + textFormat = htmlTableTextFormat + ) +``` diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala new file mode 100644 index 0000000..d3cb5a5 --- /dev/null +++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala @@ -0,0 +1,19 @@ +package com.clairvoyant.data.scalaxy.reader.excel + +import zio.config.derivation.nameWithLabel + +@nameWithLabel +case class ExcelFormat( + header: Boolean = true, + dataAddress: String = "A1", + treatEmptyValuesAsNulls: Boolean = true, + setErrorCellsToFallbackValues: Boolean = false, + usePlainNumberFormat: Boolean = false, + inferSchema: Boolean = false, + addColorColumns: Boolean = false, + timestampFormat: String = "yyyy-mm-dd hh:mm:ss", + excerptSize: Int = 10, + maxRowsInMemory: Option[Long] = None, + maxByteArraySize: Option[Long] = None, + tempFileThreshold: Option[Long] = None +) diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala new file mode 100644 index 0000000..3ec0e41 --- /dev/null +++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala @@ -0,0 +1,74 @@ +package com.clairvoyant.data.scalaxy.reader.excel + +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SparkSession} + +import java.io.{ByteArrayInputStream, File, FileOutputStream, PrintWriter} +import org.apache.poi.xssf.usermodel.XSSFWorkbook + +implicit object ExcelToDataFrameReader { + + def read( + bytes: Array[Byte], + excelFormat: ExcelFormat, + originalSchema: Option[StructType] = None, + adaptSchemaColumns: StructType => StructType = identity + ) (using sparkSession: SparkSession): DataFrame = + + import sparkSession.implicits.* + + def saveBytesToTempExcelFiles(bytes: Array[Byte]) = { + val workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes)) + + val file = File.createTempFile("excel-data-", ".xlsx") + file.deleteOnExit() + val fileOut = new FileOutputStream(file) + new PrintWriter(file) { + try { + 
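+          // Write the in-memory workbook to the temp file so that spark-excel can read it back from a path.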
+          workbook.write(fileOut)
+        } finally {
+          close()
+        }
+      }
+      file
+    }
+
+    val tempExcelFile = saveBytesToTempExcelFiles(bytes)
+
+    val excelDataFrameReader = sparkSession.read
+      .format("com.crealytics.spark.excel")
+      .options(
+        Map(
+          "header" -> excelFormat.header,
+          "dataAddress" -> excelFormat.dataAddress,
+          "treatEmptyValuesAsNulls" -> excelFormat.treatEmptyValuesAsNulls,
+          "setErrorCellsToFallbackValues" -> excelFormat.setErrorCellsToFallbackValues,
+          "usePlainNumberFormat" -> excelFormat.usePlainNumberFormat,
+          "inferSchema" -> excelFormat.inferSchema,
+          "addColorColumns" -> excelFormat.addColorColumns,
+          "timestampFormat" -> excelFormat.timestampFormat,
+          "excerptSize" -> excelFormat.excerptSize
+        ).map((optionName, optionValue) => (optionName, optionValue.toString))
+      )
+      .options(
+        Map(
+          "maxRowsInMemory" -> excelFormat.maxRowsInMemory,
+          "maxByteArraySize" -> excelFormat.maxByteArraySize,
+          "tempFileThreshold" -> excelFormat.tempFileThreshold,
+        ).collect {
+          case (optionName, Some(optionValue)) => (optionName, optionValue.toString)
+        }
+      )
+
+    excelDataFrameReader
+      .schema {
+        originalSchema.getOrElse {
+          adaptSchemaColumns {
+            excelDataFrameReader
+              .load(tempExcelFile.getAbsolutePath)
+              .schema
+          }
+        }
+      }
+      .load(tempExcelFile.getAbsolutePath)
+}
diff --git a/excel/src/test/resources/sample_data.xlsx b/excel/src/test/resources/sample_data.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..7929b4f25f82cafd741ca25207380291335d8250
GIT binary patch
literal 13645
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
new file mode 100644
index 0000000..0b1744f
--- /dev/null
+++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -0,0 +1,27 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher
+import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader
+
+import java.io.FileInputStream
+import scala.util.Using
+
+class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
+
+  val excelToDataFrameReader: ExcelToDataFrameReader.type = ExcelToDataFrameReader
+
+  "read() - with excel filepath" should "return a dataframe with correct count and schema" in {
+    val file = new java.io.File("excel/src/test/resources/sample_data.xlsx")
+    val byteArray: Array[Byte] = Using(new FileInputStream(file)) { fis =>
+      val byteArray = new Array[Byte](file.length.toInt)
+      fis.read(byteArray)
+      byteArray
+    }.get
+
+    val df = excelToDataFrameReader.read(
+      byteArray,
+      ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
+    )
+    df.count() shouldBe 2
+  }
+}

From 000c4424680c0e2dc2e7f9246da9bbeb7a60d890 Mon Sep 17 00:00:00 2001
From: Mandar Marathe
Date: Thu, 1 Feb 2024 15:16:53 +0530
Subject: [PATCH 2/7] REST-159 : Updated the documentation

---
 README.md       |   7 ++
 excel/README.md | 293 +++++------------------------------------------
 2 files changed, 35 insertions(+), 265 deletions(-)

diff --git a/README.md b/README.md
index 8440f36..5f902a7 100644
--- a/README.md
+++ b/README.md
@@ -17,3 +17,10 @@ Supported text formats are:
 * HTML Table
 
 Please see the detailed documentation [here](text/README.md).
+
+* Excel
+
+## excel
+
+User can use this library to read the data from an excel file and parse it to the spark dataframe.
+Please see the detailed documentation [here](excel/README.md).
\ No newline at end of file
diff --git a/excel/README.md b/excel/README.md
index 8c3a41d..ceeae2b 100644
--- a/excel/README.md
+++ b/excel/README.md
@@ -1,6 +1,6 @@
-# text
+# Excel
 
-User need to add below dependency to the `build.sbt` file:
+User needs to add below dependency to the `build.sbt` file:
 
 ```Scala
 ThisBuild / resolvers += "Github Repo" at "https://maven.pkg.github.com/teamclairvoyant/data-scalaxy-reader/"
@@ -12,7 +12,7 @@ ThisBuild / credentials += Credentials(
   System.getenv("GITHUB_TOKEN")
 )
 
-ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-text" % "2.0.0"
+ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-excel" % "1.0.0"
 ```
 
 Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables.
@@ -21,276 +21,39 @@ Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variab
 
 ## API
 
-The library provides below `read` APIs in type class `TextToDataFrameReader` in order to parse a text into spark
-dataframe:
+The library provides below `read` APIs in type class `ExcelToDataFrameReader` in order to parse an Excel file into spark dataframe:
 
 ```scala
-def read(
-  text: String,
-  textFormat: T,
-  originalSchema: Option[StructType] = None,
-  adaptSchemaColumns: StructType => StructType = identity
-)(using sparkSession: SparkSession): DataFrame
-
-def read(
-  text: Seq[String],
-  textFormat: T,
-  originalSchema: Option[StructType],
-  adaptSchemaColumns: StructType => StructType
-)(using sparkSession: SparkSession): DataFrame
+  def read(
+      bytes: Array[Byte],
+      excelFormat: ExcelFormat,
+      originalSchema: Option[StructType] = None,
+      adaptSchemaColumns: StructType => StructType = identity
+  ) (using sparkSession: SparkSession): DataFrame
 ```
 
 The `read` method takes below arguments:
 
 | Argument Name      | Default Value | Description                                                   |
 |:-------------------|:-------------:|:--------------------------------------------------------------|
-| text               |       -       | The text in string format to be parsed to dataframe.          |
-| textFormat         |       -       | The `TextFormat` representation for the format of the text.   |
+| bytes              |       -       | An Excel file in bytes to be parsed to the dataframe.         |
+| excelFormat        |       -       | The `ExcelFormat` representation for the format of the Excel file. |
 | originalSchema     |     None      | The schema for the dataframe.                                  |
 | adaptSchemaColumns |   identity    | The function to modify the inferred schema of the dataframe.  |
 
-User can use this library to read text data of various formats and parse it to spark dataframe.
-Supported text formats are:
-
-* CSV
-* JSON
-* XML
-
-### CSV
-
-Suppose user wants to read CSV text data `csvText` and parse it to spark dataframe.
-Then user need to perform below steps:
-
-#### 1. Import type class
-
-```scala
-import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader
-```
-
-#### 2. Import type class instance
-
-```scala
-
-```
-
-#### 3.
Define text format - -```scala -import com.clairvoyant.data.scalaxy.reader.text.formats.CSVTextFormat - -val csvTextFormat = CSVTextFormat( - header = false -) -``` - -User can provide below options to the `CSVTextFormat` instance: - -| Parameter Name | Default Value | Description | -|:------------------------------|:---------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| charToEscapeQuoteEscaping | \ | Sets a single character used for escaping the escape for the quote character. | -| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode.
This overrides `spark.sql.columnNameOfCorruptRecord`. | -| comment | # | Sets a single character used for skipping lines beginning with this character. | -| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | -| emptyValue | "" (empty string) | Sets the string representation of an empty value. | -| enableDateTimeParsingFallback | true | Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps
if values do not match the set patterns. | -| encoding | UTF-8 | Decodes the CSV files by the given encoding type. | -| enforceSchema | true | If it is set to true, the specified or inferred schema will be forcibly applied to datasource files, and headers in CSV files will be ignored.
If the option is set to false, the schema will be validated against all headers in CSV files in the case when the header option is set to true.
Field names in the schema and column names in CSV headers are checked by their positions taking into account spark.sql.caseSensitive.
Though the default value is true, it is recommended to disable the enforceSchema option to avoid incorrect results. | -| escape | \ | Sets a single character used for escaping quotes inside an already quoted value. | -| header | true | Boolean flag to tell whether csv text contains header names or not. | -| inferSchema | true | Infers the input schema automatically from data. | -| ignoreLeadingWhiteSpace | false | A flag indicating whether or not leading whitespaces from values being read should be skipped. | -| ignoreTrailingWhiteSpace | false | A flag indicating whether or not trailing whitespaces from values being read should be skipped. | -| lineSep | \n | Defines the line separator that should be used for parsing. Maximum length is 1 character. | -| locale | en-US | Sets a locale as language tag in IETF BCP 47 format. For instance, this is used while parsing dates and timestamps. | -| maxCharsPerColumn | -1 | Defines the maximum number of characters allowed for any given value being read. | | -| maxColumns | 20480 | Defines a hard limit of how many columns a record can have. | | -| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | -| multiLine | false | Parse one record, which may span multiple lines, per file. | -| nanValue | NaN | Sets the string representation of a non-number value. | -| negativeInf | -Inf | Sets the string representation of a negative infinity value. | -| nullValue | null | Sets the string representation of a null value. | -| positiveInf | Inf | Sets the string representation of a positive infinity value. | -| preferDate | true | During schema inference (inferSchema), attempts to infer string columns that contain dates as Date if the values satisfy the dateFormat option or default date format.
For columns that contain a mixture of dates and timestamps, try inferring them as TimestampType if timestamp format not specified, otherwise infer them as StringType. | -| quote | " | Sets a single character used for escaping quoted values where the separator can be part of the value.
For reading, if you would like to turn off quotations, you need to set not null but an empty string. | -| recordSep | \n | Delimiter by which rows are separated in a csv text. | -| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | -| sep | , | Delimiter by which fields in a row are separated in a csv text. | -| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | -| timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | -| unescapedQuoteHandling | STOP_AT_DELIMITER | Defines how the CsvParser will handle values with unescaped quotes.
Allowed values are STOP_AT_CLOSING_QUOTE, BACK_TO_DELIMITER, STOP_AT_DELIMITER, SKIP_VALUE, RAISE_ERROR | - -#### 4. Call API - -```scala -TextToDataFrameReader[CSVTextFormat] - .read( - text = csvText, - textFormat = csvTextFormat - ) -``` - -### JSON - -Suppose user wants to read JSON text data `jsonText` and parse it to spark dataframe. -Then user need to perform below steps: - -#### 1. Import type class - -```scala -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader -``` - -#### 2. Import type class instance - -```scala - -``` - -#### 3. Define text format - -```scala -import com.clairvoyant.data.scalaxy.reader.text.formats.JSONTextFormat - -val jsonTextFormat = JSONTextFormat( - dropFieldIfAllNull = true -) -``` - -User can provide below options to the `JSONTextFormat` instance: - -| Parameter Name | Default Value | Description | -|:-----------------------------------|:---------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------| -| allowBackslashEscapingAnyCharacter | false | Allows accepting quoting of all character using backslash quoting mechanism. | -| allowComments | false | Ignores Java/C++ style comment in JSON records. | -| allowNonNumericNumbers | true | Allows JSON parser to recognize set of “Not-a-Number” (NaN) tokens as legal floating number values. | -| allowNumericLeadingZeros | false | Allows leading zeros in numbers (e.g. 00012). | -| allowSingleQuotes | true | Allows single quotes in addition to double quotes. | -| allowUnquotedControlChars | false | Allows JSON Strings to contain unquoted control characters
(ASCII characters with value less than 32, including tab and line feed characters) or not. | -| allowUnquotedFieldNames | false | Allows unquoted JSON field names. | -| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides spark.sql.columnNameOfCorruptRecord. | -| dataColumnName | None | The name of column that actually contains dataset. If present, the api will only parse dataset of this column to dataframe. | -| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | -| dropFieldIfAllNull | false | Whether to ignore column of all null values or empty array during schema inference. | -| enableDateTimeParsingFallback | true | Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps
if values do not match the set patterns. | -| encoding | UTF-8 | Decodes the CSV files by the given encoding type. | -| inferSchema | true | Infers the input schema automatically from data. | -| lineSep | \n | Defines the line separator that should be used for parsing. Maximum length is 1 character. | -| locale | en-US | Sets a locale as language tag in IETF BCP 47 format. For instance, this is used while parsing dates and timestamps. | -| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | -| multiLine | false | Parse one record, which may span multiple lines, per file. | -| prefersDecimal | false | Infers all floating-point values as a decimal type. If the values do not fit in decimal, then it infers them as doubles. | -| primitivesAsString | false | Infers all primitive values as a string type. | -| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | -| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | -| timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | -| timeZone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | - -#### 4. Call API - -```scala -TextToDataFrameReader[JSONTextFormat] - .read( - text = jsonText, - textFormat = jsonTextFormat - ) -``` - -### XML - -Suppose user wants to read XML text data `xmlText` and parse it to spark dataframe. -Then user need to perform below steps: - -#### 1. Import type class - -```scala -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader -``` - -#### 2. Import type class instance - -```scala - -``` - -#### 3. Define text format - -```scala -import com.clairvoyant.data.scalaxy.reader.text.formats.XMLTextFormat - -val xmlTextFormat = XMLTextFormat( - rowTag = "ROW" -) -``` - -User can provide below options to the `XMLTextFormat` instance: - -| Parameter Name | Default Value | Description | -|:--------------------------|:-------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| attributePrefix | _ | The prefix for attributes so that we can differentiate attributes and elements. | -| charset | UTF-8 | Defaults to 'UTF-8' but can be set to other valid charset names. | -| columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides spark.sql.columnNameOfCorruptRecord. | -| dateFormat | yyyy-MM-dd | Sets the string that indicates a date format. | -| excludeAttribute | false | Whether you want to exclude attributes in elements or not. | -| ignoreSurroundingSpaces | false | Defines whether or not surrounding whitespaces from values being read should be skipped. | -| ignoreNamespace | false | If true, namespaces prefixes on XML elements and attributes are ignored.
Tags and would, for example, be treated as if both are just .
Note that, at the moment, namespaces cannot be ignored on the rowTag element, only its children.
Note that XML parsing is in general not namespace-aware even if false. | -| inferSchema | true | Infers the input schema automatically from data. | -| mode | FAILFAST | Allows a mode for dealing with corrupt records during parsing. Allowed values are PERMISSIVE, DROPMALFORMED and FAILFAST. | -| nullValue | null | The value to read as null value | -| rowTag | row | The row tag of your xml files to treat as a row. For example, in this xml ..., the appropriate value would be book. | -| samplingRatio | 1.0 | Defines fraction of rows used for schema inferring. | -| timestampFormat | yyyy-MM-dd HH:mm:ss | Sets the string that indicates a timestamp format. | -| valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | -| wildcardColName | xs_any | Name of a column existing in the provided schema which is interpreted as a 'wildcard'. It must have type string or array of strings.
It will match any XML child element that is not otherwise matched by the schema. The XML of the child becomes the string value of the column.
If an array, then all unmatched elements will be returned as an array of strings. As its name implies, it is meant to emulate XSD's xs:any type. | - -#### 4. Call API - -```scala -TextToDataFrameReader[XMLTextFormat] - .read( - text = xmlText, - textFormat = xmlTextFormat - ) -``` - -### HTML Table - -Suppose user wants to read a text `htmlText` containing data in the form of html table and parse it to spark dataframe. -Then user need to perform below steps: - -#### 1. Import type class - -```scala -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader -``` - -#### 2. Import type class instance - -```scala - -``` - -#### 3. Define text format - -```scala -import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat - -val htmlTableTextFormat = HTMLTableTextFormat( - tableName = "my_table" -) -``` - -User can provide below options to the `HTMLTableTextFormat` instance: - -| Parameter Name | Default Value | Mandatory | Description | -|:---------------|:-------------:|:---------:|:------------------------------------------------------------------------------| -| tableName | None | No | The name of the table in the `table` tag that you want to read the data from. | - -#### 4. Call API - -```scala -TextToDataFrameReader[HTMLTableTextFormat] - .read( - text = htmlText, - textFormat = htmlTableTextFormat - ) -``` +User can provide below options to the `ExcelFormat` instance: + +| Parameter Name | Default Value | Description | +|:------------------------------|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| header | true | Boolean flag to tell whether given excel sheet contains header names or not. | +| dataAddress | A1 | The location of the data to read from. Following address styles are supported:
`B3`: Start cell of the data. Returns all rows below and all columns to the right.
`B3:F35`: Cell range of data. Reading will return only rows and columns in the specified range.
`'My Sheet'!B3:F35`: Same as above, but with a specific sheet.
`MyTable[#All]`: Table of data. Returns all rows and columns in this table. |
| treatEmptyValuesAsNulls | true | Treats empty values as null. |
| setErrorCellsToFallbackValues | false | If set to false, errors will be converted to null. If set to true, any ERROR cell values (e.g. #N/A) will be converted to the zero values of the column's data type. |
| usePlainNumberFormat | false | If set to true, formats the cells without rounding and scientific notations. |
| inferSchema | false | Infers the input schema automatically from data. |
| addColorColumns | false | If set to true, adds a field with coloured format. |
| timestampFormat | "yyyy-mm-dd hh:mm:ss" | Sets the string that indicates a timestamp format. |
| excerptSize | 10 | If set, and if the schema is inferred, the number of rows to infer the schema from. |
| maxRowsInMemory | None | If set, uses a streaming reader which can help with big files (it will fail if used with xls format files). |
| maxByteArraySize | None | See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int- |
| tempFileThreshold | None | Number of bytes at which a zip entry is regarded as too large for holding in memory, and the data is put in a temp file instead. |
\ No newline at end of file

From 525da3d0168c9043a0dd480424a50f4c65ae3a58 Mon Sep 17 00:00:00 2001
From: Mandar Marathe
Date: Thu, 1 Feb 2024 15:18:22 +0530
Subject: [PATCH 3/7] REST-159 : Updated gitignore file

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index a04ff5b..ab276ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ project
 target
 
 docs/build
+.bsp
+.idea
+.DS_Store

From 3364148a615d9481f34b30e3e4efcfe62ef3e021 Mon Sep 17 00:00:00 2001
From: Mandar Marathe
Date: Thu, 1 Feb 2024 16:18:07 +0530
Subject: [PATCH 4/7] REST-159 : Formatting changes

---
 .../reader/excel/ExcelToDataFrameReader.scala     |  9 +++++----
 .../reader/excel/ExcelToDataFrameReaderSpec.scala | 12 +++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
index 3ec0e41..bd6a3bf 100644
--- a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
+++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
@@ -13,7 +13,7 @@ implicit object ExcelToDataFrameReader {
     excelFormat: ExcelFormat,
     originalSchema: Option[StructType] = None,
     adaptSchemaColumns: StructType => StructType = identity
-  ) (using sparkSession: SparkSession): DataFrame =
+  )(using sparkSession: SparkSession): DataFrame =
 
     import sparkSession.implicits.*
 
     def saveBytesToTempExcelFiles(bytes: Array[Byte]) = {
       val workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes))
@@ -54,9 +54,9 @@ implicit object ExcelToDataFrameReader {
       Map(
         "maxRowsInMemory" -> excelFormat.maxRowsInMemory,
         "maxByteArraySize" -> excelFormat.maxByteArraySize,
-        "tempFileThreshold" -> excelFormat.tempFileThreshold,
-      ).collect {
-        case (optionName, Some(optionValue)) => (optionName, optionValue.toString)
+        "tempFileThreshold" -> excelFormat.tempFileThreshold
+      ).collect { case (optionName, Some(optionValue)) =>
+        (optionName, optionValue.toString)
       }
     )
 
@@ -71,4 +71,5 @@ implicit object ExcelToDataFrameReader {
       }
     }
     .load(tempExcelFile.getAbsolutePath)
+
 }
diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
index 0b1744f..7d28111 100644
---
a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala +++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala @@ -12,11 +12,12 @@ class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher { "read() - with excel filepath" should "return a dataframe with correct count and schema" in { val file = new java.io.File("excel/src/test/resources/sample_data.xlsx") - val byteArray: Array[Byte] = Using(new FileInputStream(file)) { fis => - val byteArray = new Array[Byte](file.length.toInt) - fis.read(byteArray) - byteArray - }.get + val byteArray: Array[Byte] = + Using(new FileInputStream(file)) { fis => + val byteArray = new Array[Byte](file.length.toInt) + fis.read(byteArray) + byteArray + }.get val df = excelToDataFrameReader.read( byteArray, @@ -24,4 +25,5 @@ class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher { ) df.count() shouldBe 2 } + } From 26da76c8b3549f000ed080b4f4c5d4b37ec1c5ca Mon Sep 17 00:00:00 2001 From: Mandar Marathe Date: Thu, 1 Feb 2024 16:22:39 +0530 Subject: [PATCH 5/7] REST-159 : Ran Scala Fix --- .../data/scalaxy/reader/excel/ExcelToDataFrameReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala index bd6a3bf..62e8e71 100644 --- a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala +++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala @@ -1,10 +1,10 @@ package com.clairvoyant.data.scalaxy.reader.excel +import org.apache.poi.xssf.usermodel.XSSFWorkbook import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} import java.io.{ByteArrayInputStream, File, FileOutputStream, PrintWriter} -import org.apache.poi.xssf.usermodel.XSSFWorkbook implicit object ExcelToDataFrameReader { From 32e18b605d7119e23a5ce91475ae1c3ec295d7de Mon Sep 17 00:00:00 2001 From: Mandar Marathe Date: Thu, 8 Feb 2024 14:23:07 +0530 Subject: [PATCH 6/7] REST-159 : Incorporated the review comments --- README.md | 3 +- build.sbt | 11 +++--- .../reader/excel/ExcelToDataFrameReader.scala | 2 +- excel/src/test/resources/sample_data.xlsx | Bin 13645 -> 15576 bytes .../excel/ExcelToDataFrameReaderSpec.scala | 34 ++++++++++++++---- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5f902a7..293cc5d 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ This library contains several APIs to read data from various sources of differen This library supports below source systems: * Text +* Excel ## text @@ -18,8 +19,6 @@ Supported text formats are: Please see the detailed documentation [here](text/README.md). -* Excel - ## excel User can use this library to read the data from an excel file and parse it to the spark dataframe. 
diff --git a/build.sbt b/build.sbt index 62e0c6c..db9b2a0 100644 --- a/build.sbt +++ b/build.sbt @@ -50,6 +50,8 @@ val scalaParserCombinatorsVersion = "2.3.0" val sparkVersion = "3.4.1" val sparkXMLVersion = "0.16.0" val zioConfigVersion = "4.0.0-RC16" +val crealyticsVersion = "3.4.1_0.19.0" +val poiVersion = "5.2.5" // ----- TOOL DEPENDENCIES ----- // @@ -81,11 +83,11 @@ val zioConfigDependencies = Seq( ).map(_ excludeAll ("org.scala-lang.modules", "scala-collection-compat")) val crealyticsDependencies = Seq( - "com.crealytics" %% "spark-excel" % "3.4.1_0.19.0" + "com.crealytics" %% "spark-excel" % crealyticsVersion ).map(_.cross(CrossVersion.for3Use2_13)) val poiDependencies = Seq( - "org.apache.poi" % "poi" % "5.2.5" + "org.apache.poi" % "poi" % poiVersion ) // ----- MODULE DEPENDENCIES ----- // @@ -111,8 +113,7 @@ lazy val `data-scalaxy-reader` = (project in file(".")) publish / skip := true, publishLocal / skip := true ) - .aggregate(`reader-text`) - .aggregate(`reader-excel`) + .aggregate(`reader-text`, `reader-excel`) lazy val `reader-text` = (project in file("text")) .settings( @@ -124,4 +125,4 @@ lazy val `reader-excel` = (project in file("excel")) .settings( version := "1.0.0", libraryDependencies ++= excelDependencies - ) \ No newline at end of file + ) diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala index 62e8e71..1a58d70 100644 --- a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala +++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala @@ -6,7 +6,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import java.io.{ByteArrayInputStream, File, FileOutputStream, PrintWriter} -implicit object ExcelToDataFrameReader { +object ExcelToDataFrameReader { def read( bytes: Array[Byte], diff --git a/excel/src/test/resources/sample_data.xlsx b/excel/src/test/resources/sample_data.xlsx index 7929b4f25f82cafd741ca25207380291335d8250..862ff58698ae8001a5674fbc8495360909eede8d 100644 GIT binary patch literal 15576 zcmeHu2Ut{FvMw1Bq=}L{(Y|5|%fRV(bOhf0`OWN0{OSJ2SVn9xWl zXJ^dN(a!G7iu}#;3tts z-uF*r-+Y#EB$^YAT=Vq372O*Zz+!uQJ{t@s5&vdTaOEtV*?qMQ-Pej?aUL!j3}SS5 z3LGvdwI3A{1>DApBJdWr9?!mT`+Cd4?Mr5=0JTd{wX(DA0m}R?E-=v^{=qmawRl+$ zP#ULLQ1oPXtB>F)sQHko)HTNqF1HW!Wru^1Igg=_?)$C60b#+H;3MmcRHIq`%>)PX~T4QO0A_~ zA!1o-lIC=m+0)3(rU@pS+(>%v&GjIqQ=J(|a>75L(7~2*2-d3r?)9X4whH656m;%( zy$EIaFpC+Qi=8`uiC6SDQgc4P+>k zKlkw|y3eYWmmqNGWkbMVy<;@;VfS6HTQBRYrf_-7NP<*sO!Fi>;YqRELZ$B$pjoUy z7+>#GxX8gG9gFMiE%uMJSap-h{gqb@k5zS!8DCcl)EyI&huteJ;^q+=n6Q4JY6g*H z&by;-{I(?^R#TrAXb#FyN@(wpa;7%r3@8TF$?1MOO$}=l%t*2fNmV0q|3=;FOVD*{ zZ#xHk93Ii&O9S1b!tV;kkI8t$_3?!z!tlB-2DWEfra{a?O9!T|EXa{ohc#EJ7E1#m z&}+h9ZQUF>O5BIvp$WR1EuhcC=3LG}5aF|aR>leJlQ?qHzF5cqmF5hQJFIk~T&oif z8X5ql5-832z0SPXnzCEuCGs({ykOe*TE7EDQyaGGF%PYPNJHwywA=YwtxTTT;=ag7 z!rx$KJ~XVe#cH@F0upe3T2qrBI<4<$8yJnp{0x1+#*Bm9>1_sNDtJyItxPiqs1tqD zqUBm$9pY@+r08JSAnzss?fw0SPi?78Z63AbTlH&pu|2x3NHNl^%~N(uFgLI3PPN>d z7RJ{OQZHro10OZw%5YtOu@_~c^*MXyg+-bM{(;I3#o>44o8k`GqpDhK0$Q~buS_L3xj1PtC2GraZBj`tajlVF-1oy1xhmyD z9dOL#TK0`|!Kvn1b{61c$kC%gPD(sbxV)rZxVFiaS=_)aY5`%YGycgW*&@zy_U7o86J7k6(cel=!8XL=mSbL^C6IU%{-xEybNpNoORQp!qn zJv~k701_pF` z?ZOkT)Ry2ZhP!X@?k);VIwdzPK^9)qfEdRsbQ!<&rr9W(qUJFkQ0QZT-D0>H9YpB> z$kt!ql0Ij}5zc~LMBb{$VGA&PiL3?-+PsGv;whR5Y}Izn5Chwfb{6*;P%8bm?otI# z!I*@Fh89SShIZ{YrFVAmv@v)7?j{Q#+1gJD5g|dw7g7&jJ;DCA$`)@5i4UE8Q;P9e 
diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
index 7d28111..4c0307d 100644
--- a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
+++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -8,9 +8,33 @@ import scala.util.Using
 
 class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
 
-  val excelToDataFrameReader: ExcelToDataFrameReader.type = ExcelToDataFrameReader
-
   "read() - with excel filepath" should "return a dataframe with correct count and schema" in {
+
+    val expectedDF = readJSONFromText(
+      """
+        | [
+        |   {
+        |     "Created": "2021-07-29 10:35:12",
+        |     "Advertiser": "Zola",
+        |     "Transaction ID": "1210730000580100000",
+        |     "Earnings": "$0.68",
+        |     "SID": "wlus9",
+        |     "Status": "CONFIRMED",
+        |     "ClickPage": "https://www.zola.com/"
+        |   },
+        |   {
+        |     "Created": "2022-04-18 07:23:54",
+        |     "Advertiser": "TradeInn",
+        |     "Transaction ID": "1220419021230020000",
+        |     "Earnings": "$12.48",
+        |     "SID": "wles7",
+        |     "Status": "CONFIRMED",
+        |     "ClickPage": "https://www.tradeinn.com/"
+        |   }
+        | ]
+        |""".stripMargin
+    )
+
     val file = new java.io.File("excel/src/test/resources/sample_data.xlsx")
     val byteArray: Array[Byte] =
       Using(new FileInputStream(file)) { fis =>
@@ -19,11 +43,9 @@ class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
         val byteArray = new Array[Byte](file.length.toInt)
         fis.read(byteArray)
         byteArray
       }.get
 
-    val df = excelToDataFrameReader.read(
+    ExcelToDataFrameReader.read(
       byteArray,
       ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
-    )
-
-    df.count() shouldBe 2
+    ) should matchExpectedDataFrame(expectedDF)
   }
-
 }

From 5d29483d9676cf802e8d39dbc324017c17cae814 Mon Sep 17 00:00:00 2001
From: Mandar Marathe
Date: Thu, 8 Feb 2024 14:27:46 +0530
Subject: [PATCH 7/7] REST-159 : Resolved code formatting issues

---
 .../data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
index 4c0307d..e28ed41 100644
--- a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
+++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -48,4 +48,5 @@ class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
       ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
     ) should matchExpectedDataFrame(expectedDF)
   }
+
 }
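A closing note on the `dataAddress` option exercised throughout these tests: `'Transactions Report'!A2:G4` is a sheet-qualified, A1-style range in the spark-excel convention, here spanning three rows, presumably one header row plus the two data rows asserted in `expectedDF`. A hypothetical variant for a different sheet and block (sheet name and range invented for illustration):

```scala
import com.clairvoyant.data.scalaxy.reader.excel.ExcelFormat

// Hypothetical: point the reader at columns B-H of rows 3-50 on a sheet
// named "Monthly Report"; only dataAddress is confirmed by the diffs above.
val format = ExcelFormat(dataAddress = "'Monthly Report'!B3:H50")
```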