Merge pull request #9 from teamclairvoyant/REST-159_excel_reader

Introduced Excel Reader
teamclairvoyant · Feb 8, 2024 · 36b0f48 · 36b0f48
2 parents 4585381 + 5d29483
commit 36b0f48
Show file tree

Hide file tree

Showing 8 changed files with 238 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,6 @@ project
 target
 
 docs/build
+.bsp
+.idea
+.DS_Store
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@ This library contains several APIs to read data from various sources of differen
 This library supports below source systems:
 
 * Text
+* Excel
 
 ## text
 
@@ -17,3 +18,8 @@ Supported text formats are:
 * HTML Table
 
 Please see the detailed documentation [here](text/README.md).
+
+## excel
+
+User can use this library to read the data from an excel file and parse it to the spark dataframe.
+Please see the detailed documentation [here](excel/README.md).
diff --git a/build.sbt b/build.sbt
@@ -50,6 +50,8 @@ val scalaParserCombinatorsVersion = "2.3.0"
 val sparkVersion = "3.4.1"
 val sparkXMLVersion = "0.16.0"
 val zioConfigVersion = "4.0.0-RC16"
+val crealyticsVersion = "3.4.1_0.19.0"
+val poiVersion = "5.2.5"
 
 // ----- TOOL DEPENDENCIES ----- //
 
@@ -80,6 +82,14 @@ val zioConfigDependencies = Seq(
   "dev.zio" %% "zio-config-magnolia" % zioConfigVersion
 ).map(_ excludeAll ("org.scala-lang.modules", "scala-collection-compat"))
 
+val crealyticsDependencies = Seq(
+  "com.crealytics" %% "spark-excel" % crealyticsVersion
+).map(_.cross(CrossVersion.for3Use2_13))
+
+val poiDependencies = Seq(
+  "org.apache.poi" % "poi" % poiVersion
+)
+
 // ----- MODULE DEPENDENCIES ----- //
 
 val textDependencies =
@@ -89,17 +99,30 @@ val textDependencies =
     sparkXMLDependencies ++
     zioConfigDependencies
 
+val excelDependencies =
+  dataScalaxyTestUtilDependencies ++
+    crealyticsDependencies ++
+    poiDependencies ++
+    sparkDependencies ++
+    zioConfigDependencies
+
 // ----- PROJECTS ----- //
 
 lazy val `data-scalaxy-reader` = (project in file("."))
   .settings(
     publish / skip := true,
     publishLocal / skip := true
   )
-  .aggregate(`reader-text`)
+  .aggregate(`reader-text`, `reader-excel`)
 
 lazy val `reader-text` = (project in file("text"))
   .settings(
     version := "2.0.0",
     libraryDependencies ++= textDependencies
   )
+
+lazy val `reader-excel` = (project in file("excel"))
+  .settings(
+    version := "1.0.0",
+    libraryDependencies ++= excelDependencies
+  )
diff --git a/excel/README.md b/excel/README.md
@@ -0,0 +1,59 @@
+# Excel
+
+User needs to add below dependency to the `build.sbt` file:
+
+```Scala
+ThisBuild / resolvers += "Github Repo" at "https://maven.pkg.github.com/teamclairvoyant/data-scalaxy-reader/"
+
+ThisBuild / credentials += Credentials(
+  "GitHub Package Registry",
+  "maven.pkg.github.com",
+  System.getenv("GITHUB_USERNAME"),
+  System.getenv("GITHUB_TOKEN")
+)
+
+ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-excel" % "1.0.0"
+```
+
+Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables.
+
+`GITHUB_TOKEN` is the Personal Access Token with the permission to read packages.
+
+## API
+
+The library provides below `read` APIs in type class `ExcelToDataFrameReader` in order to parse an Excel file into spark dataframe:
+
+```scala
+ def read(
+           bytes: Array[Byte],
+           excelFormat: ExcelFormat,
+           originalSchema: Option[StructType] = None,
+           adaptSchemaColumns: StructType => StructType = identity
+         ) (using sparkSession: SparkSession): DataFrame
+```
+
+The `read` method takes below arguments:
+
+| Argument Name      | Default Value | Description                                                  |
+|:-------------------|:-------------:|:-------------------------------------------------------------|
+| bytes              |       -       | An Excel file in bytes to be parsed to the dataframe.        |
+| excelFormat        |       -       | The `ExcelFormat` representation for the format of the text. |
+| originalSchema     |     None      | The schema for the dataframe.                                |
+| adaptSchemaColumns |   identity    | The function to modify the inferred schema of the dataframe. |
+
+User can provide below options to the `ExcelFormat` instance:
+
+| Parameter Name                |     Default Value     | Description                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|:------------------------------|:---------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| header                        |         true          | Boolean flag to tell whether given excel sheet contains header names or not.                                                                                                                                                                                                                                                                                                                                                           |
+| dataAddress                   |          A1           | The location of the data to read from. Following address styles are supported: <br/> `B3:` Start cell of the data. Returns all rows below and all columns to the right. <br/> `B3:F35:` Cell range of data. Reading will return only rows and columns in the specified range. <br/> `'My Sheet'!B3:F35:` Same as above, but with a specific sheet. <br/> `MyTable[#All]:` Table of data. Returns all rows and columns in this table.   |
+| treatEmptyValuesAsNulls       |         true          | Treats empty values as null                                                                                                                                                                                                                                                                                                                                                                                                            |
+| setErrorCellsToFallbackValues |         false         | If set false errors will be converted to null. If true, any ERROR cell values (e.g. #N/A) will be converted to the zero values of the column's data type.                                                                                                                                                                                                                                                                              |
+| usePlainNumberFormat          |         false         | If true, format the cells without rounding and scientific notations                                                                                                                                                                                                                                                                                                                                                                    |
+| inferSchema                   |         false         | Infers the input schema automatically from data.                                                                                                                                                                                                                                                                                                                                                                                       |
+| addColorColumns               |         false         | If it is set to true, adds field with coloured format                                                                                                                                                                                                                                                                                                                                                                                  |
+| timestampFormat               | "yyyy-mm-dd hh:mm:ss" | String timestamp format                                                                                                                                                                                                                                                                                                                                                                                                                |
+| excerptSize                   |          10           | If set and if schema inferred, number of rows to infer schema from                                                                                                                                                                                                                                                                                                                                                                     |
+| maxRowsInMemory               |         None          | If set, uses a streaming reader which can help with big files (will fail if used with xls format files)                                                                                                                                                                                                                                                                                                                                |
+| maxByteArraySize              |         None          | See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int-                                                                                                                                                                                                                                                                                                                                   |
+| tempFileThreshold             |         None          | Number of bytes at which a zip entry is regarded as too large for holding in memory and the data is put in a temp file instead                                                                                                                                                                                                                                                                                                         |
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala
@@ -0,0 +1,19 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import zio.config.derivation.nameWithLabel
+
+@nameWithLabel
+case class ExcelFormat(
+    header: Boolean = true,
+    dataAddress: String = "A1",
+    treatEmptyValuesAsNulls: Boolean = true,
+    setErrorCellsToFallbackValues: Boolean = false,
+    usePlainNumberFormat: Boolean = false,
+    inferSchema: Boolean = false,
+    addColorColumns: Boolean = false,
+    timestampFormat: String = "yyyy-mm-dd hh:mm:ss",
+    excerptSize: Int = 10,
+    maxRowsInMemory: Option[Long] = None,
+    maxByteArraySize: Option[Long] = None,
+    tempFileThreshold: Option[Long] = None
+)
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
@@ -0,0 +1,75 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import java.io.{ByteArrayInputStream, File, FileOutputStream, PrintWriter}
+
+object ExcelToDataFrameReader {
+
+  def read(
+      bytes: Array[Byte],
+      excelFormat: ExcelFormat,
+      originalSchema: Option[StructType] = None,
+      adaptSchemaColumns: StructType => StructType = identity
+  )(using sparkSession: SparkSession): DataFrame =
+
+    import sparkSession.implicits.*
+
+    def saveBytesToTempExcelFiles(bytes: Array[Byte]) = {
+      val workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes))
+
+      val file = File.createTempFile("excel-data-", ".xlsx")
+      file.deleteOnExit()
+      val fileOut = new FileOutputStream(file)
+      new PrintWriter(file) {
+        try {
+          workbook.write(fileOut)
+        } finally {
+          close()
+        }
+      }
+      file
+    }
+
+    val tempExcelFile = saveBytesToTempExcelFiles(bytes)
+
+    val excelDataFrameReader = sparkSession.read
+      .format("com.crealytics.spark.excel")
+      .options(
+        Map(
+          "header" -> excelFormat.header,
+          "dataAddress" -> excelFormat.dataAddress,
+          "treatEmptyValuesAsNulls" -> excelFormat.treatEmptyValuesAsNulls,
+          "setErrorCellsToFallbackValues" -> excelFormat.setErrorCellsToFallbackValues,
+          "usePlainNumberFormat" -> excelFormat.usePlainNumberFormat,
+          "inferSchema" -> excelFormat.inferSchema,
+          "addColorColumns" -> excelFormat.addColorColumns,
+          "timestampFormat" -> excelFormat.timestampFormat,
+          "excerptSize" -> excelFormat.excerptSize
+        ).map((optionName, optionValue) => (optionName, optionValue.toString))
+      )
+      .options(
+        Map(
+          "maxRowsInMemory" -> excelFormat.maxRowsInMemory,
+          "maxByteArraySize" -> excelFormat.maxByteArraySize,
+          "tempFileThreshold" -> excelFormat.tempFileThreshold
+        ).collect { case (optionName, Some(optionValue)) =>
+          (optionName, optionValue.toString)
+        }
+      )
+
+    excelDataFrameReader
+      .schema {
+        originalSchema.getOrElse {
+          adaptSchemaColumns {
+            excelDataFrameReader
+              .load(tempExcelFile.getAbsolutePath)
+              .schema
+          }
+        }
+      }
+      .load(tempExcelFile.getAbsolutePath)
+
+}
diff --git a/excel/src/test/resources/sample_data.xlsx b/excel/src/test/resources/sample_data.xlsx
diff --git a/...src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/...src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -0,0 +1,52 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher
+import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader
+
+import java.io.FileInputStream
+import scala.util.Using
+
+class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
+
+  "read() - with excel filepath" should "return a dataframe with correct count and schema" in {
+
+    val expectedDF = readJSONFromText(
+      """
+        |  [
+        |    {
+        |      "Created": "2021-07-29 10:35:12",
+        |      "Advertiser": "Zola",
+        |      "Transaction ID": "1210730000580100000",
+        |      "Earnings": "$0.68",
+        |      "SID": "wlus9",
+        |      "Status": "CONFIRMED",
+        |      "ClickPage": "https://www.zola.com/"
+        |    },
+        |    {
+        |      "Created": "2022-04-18 07:23:54",
+        |      "Advertiser": "TradeInn",
+        |      "Transaction ID": "1220419021230020000",
+        |      "Earnings": "$12.48",
+        |      "SID": "wles7",
+        |      "Status": "CONFIRMED",
+        |      "ClickPage": "https://www.tradeinn.com/"
+        |    }
+        |  ]
+        |""".stripMargin
+    )
+
+    val file = new java.io.File("excel/src/test/resources/sample_data.xlsx")
+    val byteArray: Array[Byte] =
+      Using(new FileInputStream(file)) { fis =>
+        val byteArray = new Array[Byte](file.length.toInt)
+        fis.read(byteArray)
+        byteArray
+      }.get
+
+    ExcelToDataFrameReader.read(
+      byteArray,
+      ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
+    ) should matchExpectedDataFrame(expectedDF)
+  }
+
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,6 @@ project @@
     target
     docs/build
+    .bsp
+    .idea
+    .DS_Store