diff --git a/.gitignore b/.gitignore
index a04ff5b..ab276ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ project
target
docs/build
+.bsp
+.idea
+.DS_Store
diff --git a/README.md b/README.md
index 8440f36..293cc5d 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ This library contains several APIs to read data from various sources of differen
This library supports below source systems:
* Text
+* Excel
## text
@@ -17,3 +18,8 @@ Supported text formats are:
* HTML Table
Please see the detailed documentation [here](text/README.md).
+
+## excel
+
+Users can use this library to read data from an Excel file and parse it into a Spark dataframe.
+Please see the detailed documentation [here](excel/README.md).
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
index c8337de..db9b2a0 100644
--- a/build.sbt
+++ b/build.sbt
@@ -50,6 +50,8 @@ val scalaParserCombinatorsVersion = "2.3.0"
val sparkVersion = "3.4.1"
val sparkXMLVersion = "0.16.0"
val zioConfigVersion = "4.0.0-RC16"
+val crealyticsVersion = "3.4.1_0.19.0"
+val poiVersion = "5.2.5"
// ----- TOOL DEPENDENCIES ----- //
@@ -80,6 +82,14 @@ val zioConfigDependencies = Seq(
"dev.zio" %% "zio-config-magnolia" % zioConfigVersion
).map(_ excludeAll ("org.scala-lang.modules", "scala-collection-compat"))
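+// spark-excel ships no Scala 3 artifact, so use its Scala 2.13 build from Scala 3 (CrossVersion.for3Use2_13)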
+val crealyticsDependencies = Seq(
+ "com.crealytics" %% "spark-excel" % crealyticsVersion
+).map(_.cross(CrossVersion.for3Use2_13))
+
+val poiDependencies = Seq(
+ "org.apache.poi" % "poi" % poiVersion
+)
+
// ----- MODULE DEPENDENCIES ----- //
val textDependencies =
@@ -89,6 +99,13 @@ val textDependencies =
sparkXMLDependencies ++
zioConfigDependencies
+val excelDependencies =
+ dataScalaxyTestUtilDependencies ++
+ crealyticsDependencies ++
+ poiDependencies ++
+ sparkDependencies ++
+ zioConfigDependencies
+
// ----- PROJECTS ----- //
lazy val `data-scalaxy-reader` = (project in file("."))
@@ -96,10 +113,16 @@ lazy val `data-scalaxy-reader` = (project in file("."))
publish / skip := true,
publishLocal / skip := true
)
- .aggregate(`reader-text`)
+ .aggregate(`reader-text`, `reader-excel`)
lazy val `reader-text` = (project in file("text"))
.settings(
version := "2.0.0",
libraryDependencies ++= textDependencies
)
+
+lazy val `reader-excel` = (project in file("excel"))
+ .settings(
+ version := "1.0.0",
+ libraryDependencies ++= excelDependencies
+ )
diff --git a/excel/README.md b/excel/README.md
new file mode 100644
index 0000000..ceeae2b
--- /dev/null
+++ b/excel/README.md
@@ -0,0 +1,59 @@
+# Excel
+
+Users need to add the below dependency to their `build.sbt` file:
+
+```scala
+ThisBuild / resolvers += "Github Repo" at "https://maven.pkg.github.com/teamclairvoyant/data-scalaxy-reader/"
+
+ThisBuild / credentials += Credentials(
+ "GitHub Package Registry",
+ "maven.pkg.github.com",
+ System.getenv("GITHUB_USERNAME"),
+ System.getenv("GITHUB_TOKEN")
+)
+
+ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-excel" % "1.0.0"
+```
+
+Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to your environment variables.
+
+`GITHUB_TOKEN` is a Personal Access Token with permission to read packages.
+
+## API
+
+The library provides the below `read` API in the `ExcelToDataFrameReader` object to parse an Excel file into a Spark dataframe:
+
+```scala
+def read(
+  bytes: Array[Byte],
+  excelFormat: ExcelFormat,
+  originalSchema: Option[StructType] = None,
+  adaptSchemaColumns: StructType => StructType = identity
+)(using sparkSession: SparkSession): DataFrame
+```
+
+The `read` method takes the below arguments:
+
+| Argument Name | Default Value | Description |
+|:-------------------|:-------------:|:-------------------------------------------------------------|
+| bytes | - | The Excel file content, as a byte array, to be parsed into the dataframe. |
+| excelFormat | - | The `ExcelFormat` representation of the Excel file format. |
+| originalSchema | None | The schema for the dataframe. If not provided, the schema is inferred from the data. |
+| adaptSchemaColumns | identity | The function to modify the inferred schema of the dataframe. |
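+
+Below is a minimal usage sketch; the file path and Spark settings are illustrative only, not part of the library:
+
+```scala
+import com.clairvoyant.data.scalaxy.reader.excel.{ExcelFormat, ExcelToDataFrameReader}
+import org.apache.spark.sql.SparkSession
+
+import java.nio.file.{Files, Paths}
+
+given sparkSession: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
+
+// Hypothetical path to the Excel file to be parsed
+val bytes = Files.readAllBytes(Paths.get("/path/to/sample_data.xlsx"))
+
+val df = ExcelToDataFrameReader.read(
+  bytes,
+  ExcelFormat(header = true, inferSchema = true)
+)
+```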
+
+Users can provide the below options to the `ExcelFormat` instance:
+
+| Parameter Name | Default Value | Description |
+|:------------------------------|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| header | true | Boolean flag to indicate whether the given Excel sheet contains header names. |
+| dataAddress | A1 | The location of the data to read from. The following address styles are supported: <br/> `B3`: Start cell of the data. Returns all rows below and all columns to the right. <br/> `B3:F35`: Cell range of data. Reading will return only the rows and columns in the specified range. <br/> `'My Sheet'!B3:F35`: Same as above, but with a specific sheet. <br/> `MyTable[#All]`: Table of data. Returns all rows and columns in this table. |
+| treatEmptyValuesAsNulls | true | Treats empty values as nulls. |
+| setErrorCellsToFallbackValues | false | If set to false, error cells are converted to null. If true, any ERROR cell values (e.g. #N/A) are converted to the zero values of the column's data type. |
+| usePlainNumberFormat | false | If true, formats the cells without rounding and scientific notation. |
+| inferSchema | false | Infers the input schema automatically from the data. |
+| addColorColumns | false | If set to true, adds a colour column for each field. |
+| timestampFormat | "yyyy-mm-dd hh:mm:ss" | String timestamp format. |
+| excerptSize | 10 | If set and if the schema is inferred, the number of rows to infer the schema from. |
+| maxRowsInMemory | None | If set, uses a streaming reader which can help with big files (will fail if used with xls format files). |
+| maxByteArraySize | None | See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int- |
+| tempFileThreshold | None | The number of bytes at which a zip entry is regarded as too large to hold in memory, and its data is put in a temp file instead. |
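+
+For example, a sketch of an `ExcelFormat` that reads a specific cell range from a named sheet using the streaming reader (the sheet name, range, and row limit below are hypothetical):
+
+```scala
+val excelFormat = ExcelFormat(
+  header = true,
+  // Hypothetical sheet name and cell range
+  dataAddress = "'Transactions Report'!B3:F35",
+  // Stream rows instead of loading the whole workbook into memory (xlsx only)
+  maxRowsInMemory = Some(1000L)
+)
+```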
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala
new file mode 100644
index 0000000..d3cb5a5
--- /dev/null
+++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala
@@ -0,0 +1,19 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import zio.config.derivation.nameWithLabel
+
+@nameWithLabel
+case class ExcelFormat(
+ header: Boolean = true,
+ dataAddress: String = "A1",
+ treatEmptyValuesAsNulls: Boolean = true,
+ setErrorCellsToFallbackValues: Boolean = false,
+ usePlainNumberFormat: Boolean = false,
+ inferSchema: Boolean = false,
+ addColorColumns: Boolean = false,
+ timestampFormat: String = "yyyy-mm-dd hh:mm:ss",
+ excerptSize: Int = 10,
+ maxRowsInMemory: Option[Long] = None,
+ maxByteArraySize: Option[Long] = None,
+ tempFileThreshold: Option[Long] = None
+)
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
new file mode 100644
index 0000000..1a58d70
--- /dev/null
+++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
@@ -0,0 +1,75 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import java.io.{ByteArrayInputStream, File, FileOutputStream}
+
+object ExcelToDataFrameReader {
+
+ def read(
+ bytes: Array[Byte],
+ excelFormat: ExcelFormat,
+ originalSchema: Option[StructType] = None,
+ adaptSchemaColumns: StructType => StructType = identity
+ )(using sparkSession: SparkSession): DataFrame =
+
+ import sparkSession.implicits.*
+
+    def saveBytesToTempExcelFile(bytes: Array[Byte]) = {
+      val workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes))
+
+      val file = File.createTempFile("excel-data-", ".xlsx")
+      file.deleteOnExit()
+
+      // Write the workbook to the temp file, closing both resources even on failure
+      val fileOut = new FileOutputStream(file)
+      try workbook.write(fileOut)
+      finally {
+        fileOut.close()
+        workbook.close()
+      }
+
+      file
+    }
+
+    val tempExcelFile = saveBytesToTempExcelFile(bytes)
+
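+    // Build the spark-excel reader, passing every scalar option as a string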
+ val excelDataFrameReader = sparkSession.read
+ .format("com.crealytics.spark.excel")
+ .options(
+ Map(
+ "header" -> excelFormat.header,
+ "dataAddress" -> excelFormat.dataAddress,
+ "treatEmptyValuesAsNulls" -> excelFormat.treatEmptyValuesAsNulls,
+ "setErrorCellsToFallbackValues" -> excelFormat.setErrorCellsToFallbackValues,
+ "usePlainNumberFormat" -> excelFormat.usePlainNumberFormat,
+ "inferSchema" -> excelFormat.inferSchema,
+ "addColorColumns" -> excelFormat.addColorColumns,
+ "timestampFormat" -> excelFormat.timestampFormat,
+ "excerptSize" -> excelFormat.excerptSize
+ ).map((optionName, optionValue) => (optionName, optionValue.toString))
+ )
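+      // Pass the optional tuning options only when they are explicitly set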
+ .options(
+ Map(
+ "maxRowsInMemory" -> excelFormat.maxRowsInMemory,
+ "maxByteArraySize" -> excelFormat.maxByteArraySize,
+ "tempFileThreshold" -> excelFormat.tempFileThreshold
+ ).collect { case (optionName, Some(optionValue)) =>
+ (optionName, optionValue.toString)
+ }
+ )
+
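+    // Use the caller-provided schema if present; otherwise infer it via an initial load and let the caller adapt it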
+ excelDataFrameReader
+ .schema {
+ originalSchema.getOrElse {
+ adaptSchemaColumns {
+ excelDataFrameReader
+ .load(tempExcelFile.getAbsolutePath)
+ .schema
+ }
+ }
+ }
+ .load(tempExcelFile.getAbsolutePath)
+
+}
diff --git a/excel/src/test/resources/sample_data.xlsx b/excel/src/test/resources/sample_data.xlsx
new file mode 100644
index 0000000..862ff58
Binary files /dev/null and b/excel/src/test/resources/sample_data.xlsx differ
diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
new file mode 100644
index 0000000..e28ed41
--- /dev/null
+++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -0,0 +1,52 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher
+import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader
+
+import java.io.FileInputStream
+import scala.util.Using
+
+class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
+
+ "read() - with excel filepath" should "return a dataframe with correct count and schema" in {
+
+ val expectedDF = readJSONFromText(
+ """
+ | [
+ | {
+ | "Created": "2021-07-29 10:35:12",
+ | "Advertiser": "Zola",
+ | "Transaction ID": "1210730000580100000",
+ | "Earnings": "$0.68",
+ | "SID": "wlus9",
+ | "Status": "CONFIRMED",
+ | "ClickPage": "https://www.zola.com/"
+ | },
+ | {
+ | "Created": "2022-04-18 07:23:54",
+ | "Advertiser": "TradeInn",
+ | "Transaction ID": "1220419021230020000",
+ | "Earnings": "$12.48",
+ | "SID": "wles7",
+ | "Status": "CONFIRMED",
+ | "ClickPage": "https://www.tradeinn.com/"
+ | }
+ | ]
+ |""".stripMargin
+ )
+
+    val file = new java.io.File("excel/src/test/resources/sample_data.xlsx")
+
+    // A single InputStream.read call is not guaranteed to fill the array, so read the stream fully
+    val byteArray: Array[Byte] =
+      Using(new FileInputStream(file))(_.readAllBytes()).get
+
+ ExcelToDataFrameReader.read(
+ byteArray,
+ ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
+ ) should matchExpectedDataFrame(expectedDF)
+ }
+
+}