diff --git a/.gitignore b/.gitignore
index a04ff5b..ab276ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@
 project
 target
 docs/build
+.bsp
+.idea
+.DS_Store
diff --git a/README.md b/README.md
index 8440f36..293cc5d 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ This library contains several APIs to read data from various sources of differen
 This library supports below source systems:
 
 * Text
+* Excel
 
 ## text
 
@@ -17,3 +18,8 @@ Supported text formats are:
 * HTML Table
 
 Please see the detailed documentation [here](text/README.md).
+
+## excel
+
+Users can use this library to read data from an Excel file and parse it into a Spark dataframe.
+Please see the detailed documentation [here](excel/README.md).
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
index c8337de..db9b2a0 100644
--- a/build.sbt
+++ b/build.sbt
@@ -50,6 +50,8 @@
 val scalaParserCombinatorsVersion = "2.3.0"
 val sparkVersion = "3.4.1"
 val sparkXMLVersion = "0.16.0"
 val zioConfigVersion = "4.0.0-RC16"
+val crealyticsVersion = "3.4.1_0.19.0"
+val poiVersion = "5.2.5"
 
 // ----- TOOL DEPENDENCIES ----- //
@@ -80,6 +82,14 @@ val zioConfigDependencies = Seq(
   "dev.zio" %% "zio-config-magnolia" % zioConfigVersion
 ).map(_ excludeAll ("org.scala-lang.modules", "scala-collection-compat"))
 
+val crealyticsDependencies = Seq(
+  "com.crealytics" %% "spark-excel" % crealyticsVersion
+).map(_.cross(CrossVersion.for3Use2_13))
+
+val poiDependencies = Seq(
+  "org.apache.poi" % "poi" % poiVersion
+)
+
 // ----- MODULE DEPENDENCIES ----- //
 
 val textDependencies =
@@ -89,6 +99,13 @@ val textDependencies =
     sparkXMLDependencies ++
     zioConfigDependencies
 
+val excelDependencies =
+  dataScalaxyTestUtilDependencies ++
+    crealyticsDependencies ++
+    poiDependencies ++
+    sparkDependencies ++
+    zioConfigDependencies
+
 // ----- PROJECTS ----- //
 
 lazy val `data-scalaxy-reader` = (project in file("."))
@@ -96,10 +113,16 @@
   .settings(
     publish / skip := true,
     publishLocal / skip := true
   )
-  .aggregate(`reader-text`)
+  .aggregate(`reader-text`, `reader-excel`)
 
 lazy val `reader-text` = (project in file("text"))
   .settings(
     version := "2.0.0",
     libraryDependencies ++= textDependencies
   )
+
+lazy val `reader-excel` = (project in file("excel"))
+  .settings(
+    version := "1.0.0",
+    libraryDependencies ++= excelDependencies
+  )
diff --git a/excel/README.md b/excel/README.md
new file mode 100644
index 0000000..ceeae2b
--- /dev/null
+++ b/excel/README.md
@@ -0,0 +1,71 @@
+# Excel
+
+Users need to add the below dependency to the `build.sbt` file:
+
+```Scala
+ThisBuild / resolvers += "Github Repo" at "https://maven.pkg.github.com/teamclairvoyant/data-scalaxy-reader/"
+
+ThisBuild / credentials += Credentials(
+  "GitHub Package Registry",
+  "maven.pkg.github.com",
+  System.getenv("GITHUB_USERNAME"),
+  System.getenv("GITHUB_TOKEN")
+)
+
+ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-excel" % "1.0.0"
+```
+
+Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables.
+
+`GITHUB_TOKEN` is the Personal Access Token with the permission to read packages.
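+
+As an alternative to environment variables, sbt can also pick these credentials up from a local credentials file. A minimal sketch, assuming the conventional `~/.sbt` location (the file name `github.credentials` is an arbitrary choice):
+
+```Scala
+// In build.sbt: read the GitHub Packages credentials from a properties file
+// instead of environment variables. The file would contain four entries:
+//   realm=GitHub Package Registry
+//   host=maven.pkg.github.com
+//   user=<your GitHub username>
+//   password=<your personal access token>
+ThisBuild / credentials += Credentials(Path.userHome / ".sbt" / "github.credentials")
+```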
+
+## API
+
+The library provides the below `read` API in the `ExcelToDataFrameReader` object in order to parse an Excel file into a Spark dataframe:
+
+```scala
+  def read(
+      bytes: Array[Byte],
+      excelFormat: ExcelFormat,
+      originalSchema: Option[StructType] = None,
+      adaptSchemaColumns: StructType => StructType = identity
+  )(using sparkSession: SparkSession): DataFrame
+```
+
+The `read` method takes the below arguments:
+
+| Argument Name      | Default Value | Description                                                                                       |
+|:-------------------|:-------------:|:--------------------------------------------------------------------------------------------------|
+| bytes              |       -       | An Excel file, as bytes, to be parsed into the dataframe.                                         |
+| excelFormat        |       -       | The `ExcelFormat` representation for the format of the Excel data.                                |
+| originalSchema     |     None      | The schema for the dataframe. If provided, it is used as-is and schema inference is skipped.      |
+| adaptSchemaColumns |   identity    | The function to modify the inferred schema of the dataframe. Ignored if `originalSchema` is set.  |
+
+Users can provide the below options to the `ExcelFormat` instance:
+
+| Parameter Name                | Default Value         | Description |
+|:------------------------------|:---------------------:|:------------|
+| header                        | true                  | Boolean flag to indicate whether the given Excel sheet contains header names or not. |
+| dataAddress                   | A1                    | The location of the data to read from. The following address styles are supported: <br/> `B3`: Start cell of the data. Returns all rows below and all columns to the right. <br/> `B3:F35`: Cell range of data. Reading will return only the rows and columns in the specified range. <br/> `'My Sheet'!B3:F35`: Same as above, but with a specific sheet. <br/> `MyTable[#All]`: Table of data. Returns all rows and columns in this table. |
+| treatEmptyValuesAsNulls       | true                  | If true, treats empty values as nulls. |
+| setErrorCellsToFallbackValues | false                 | If set to false, error cells are converted to null. If true, any error cell values (e.g. `#N/A`) are converted to the zero value of the column's data type. |
+| usePlainNumberFormat          | false                 | If true, formats the cells without rounding and scientific notation. |
+| inferSchema                   | false                 | If true, infers the input schema automatically from the data. |
+| addColorColumns               | false                 | If set to true, adds colour columns to the dataframe. |
+| timestampFormat               | "yyyy-mm-dd hh:mm:ss" | The string timestamp format. |
+| excerptSize                   | 10                    | If set and the schema is inferred, the number of rows used to infer the schema. |
+| maxRowsInMemory               | None                  | If set, uses a streaming reader, which can help with big files (fails if used with `.xls` format files). |
+| maxByteArraySize              | None                  | See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int- |
+| tempFileThreshold             | None                  | The number of bytes at which a zip entry is regarded as too large to hold in memory, so that its data is put in a temp file instead. |
\ No newline at end of file
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala
new file mode 100644
index 0000000..d3cb5a5
--- /dev/null
+++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelFormat.scala
@@ -0,0 +1,19 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import zio.config.derivation.nameWithLabel
+
+@nameWithLabel
+case class ExcelFormat(
+    header: Boolean = true,
+    dataAddress: String = "A1",
+    treatEmptyValuesAsNulls: Boolean = true,
+    setErrorCellsToFallbackValues: Boolean = false,
+    usePlainNumberFormat: Boolean = false,
+    inferSchema: Boolean = false,
+    addColorColumns: Boolean = false,
+    timestampFormat: String = "yyyy-mm-dd hh:mm:ss",
+    excerptSize: Int = 10,
+    maxRowsInMemory: Option[Long] = None,
+    maxByteArraySize: Option[Long] = None,
+    tempFileThreshold: Option[Long] = None
+)
diff --git a/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
new file mode 100644
index 0000000..1a58d70
--- /dev/null
+++ b/excel/src/main/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReader.scala
@@ -0,0 +1,76 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import java.io.{ByteArrayInputStream, File, FileOutputStream}
+
+object ExcelToDataFrameReader {
+
+  def read(
+      bytes: Array[Byte],
+      excelFormat: ExcelFormat,
+      originalSchema: Option[StructType] = None,
+      adaptSchemaColumns: StructType => StructType = identity
+  )(using sparkSession: SparkSession): DataFrame =
+
+    import sparkSession.implicits.*
+
+    def saveBytesToTempExcelFiles(bytes: Array[Byte]) = {
+      val workbook = new XSSFWorkbook(new ByteArrayInputStream(bytes))
+
+      val file = File.createTempFile("excel-data-", ".xlsx")
+      file.deleteOnExit()
+      // Write the workbook to the temp file, closing both resources when done.
+      val fileOut = new FileOutputStream(file)
+      try workbook.write(fileOut)
+      finally {
+        fileOut.close()
+        workbook.close()
+      }
+      file
+    }
+
+    val tempExcelFile = saveBytesToTempExcelFiles(bytes)
+
+    val excelDataFrameReader =
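+      // Configure the crealytics spark-excel reader; the Option-valued settings are added only when defined.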
+      sparkSession.read
+        .format("com.crealytics.spark.excel")
+        .options(
+          Map(
+            "header" -> excelFormat.header,
+            "dataAddress" -> excelFormat.dataAddress,
+            "treatEmptyValuesAsNulls" -> excelFormat.treatEmptyValuesAsNulls,
+            "setErrorCellsToFallbackValues" -> excelFormat.setErrorCellsToFallbackValues,
+            "usePlainNumberFormat" -> excelFormat.usePlainNumberFormat,
+            "inferSchema" -> excelFormat.inferSchema,
+            "addColorColumns" -> excelFormat.addColorColumns,
+            "timestampFormat" -> excelFormat.timestampFormat,
+            "excerptSize" -> excelFormat.excerptSize
+          ).map((optionName, optionValue) => (optionName, optionValue.toString))
+        )
+        .options(
+          Map(
+            "maxRowsInMemory" -> excelFormat.maxRowsInMemory,
+            "maxByteArraySize" -> excelFormat.maxByteArraySize,
+            "tempFileThreshold" -> excelFormat.tempFileThreshold
+          ).collect { case (optionName, Some(optionValue)) =>
+            (optionName, optionValue.toString)
+          }
+        )
+
+    // Use the caller-provided schema if present; otherwise infer it and let adaptSchemaColumns adjust it.
+    excelDataFrameReader
+      .schema {
+        originalSchema.getOrElse {
+          adaptSchemaColumns {
+            excelDataFrameReader
+              .load(tempExcelFile.getAbsolutePath)
+              .schema
+          }
+        }
+      }
+      .load(tempExcelFile.getAbsolutePath)
+
+}
diff --git a/excel/src/test/resources/sample_data.xlsx b/excel/src/test/resources/sample_data.xlsx
new file mode 100644
index 0000000..862ff58
Binary files /dev/null and b/excel/src/test/resources/sample_data.xlsx differ
diff --git a/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
new file mode 100644
index 0000000..e28ed41
--- /dev/null
+++ b/excel/src/test/scala/com/clairvoyant/data/scalaxy/reader/excel/ExcelToDataFrameReaderSpec.scala
@@ -0,0 +1,49 @@
+package com.clairvoyant.data.scalaxy.reader.excel
+
+import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher
+import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader
+
+import java.io.FileInputStream
+import scala.util.Using
+
+class ExcelToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {
+
+  "read() - with excel file bytes" should "return a dataframe with correct count and schema" in {
+
+    val expectedDF = readJSONFromText(
+      """
+        | [
+        |   {
+        |     "Created": "2021-07-29 10:35:12",
+        |     "Advertiser": "Zola",
+        |     "Transaction ID": "1210730000580100000",
+        |     "Earnings": "$0.68",
+        |     "SID": "wlus9",
+        |     "Status": "CONFIRMED",
+        |     "ClickPage": "https://www.zola.com/"
+        |   },
+        |   {
+        |     "Created": "2022-04-18 07:23:54",
+        |     "Advertiser": "TradeInn",
+        |     "Transaction ID": "1220419021230020000",
+        |     "Earnings": "$12.48",
+        |     "SID": "wles7",
+        |     "Status": "CONFIRMED",
+        |     "ClickPage": "https://www.tradeinn.com/"
+        |   }
+        | ]
+        |""".stripMargin
+    )
+
+    val file = new java.io.File("excel/src/test/resources/sample_data.xlsx")
+    // readAllBytes guarantees the whole file is read; a single read() call may read fewer bytes.
+    val byteArray: Array[Byte] =
+      Using(new FileInputStream(file))(_.readAllBytes()).get
+
+    ExcelToDataFrameReader.read(
+      byteArray,
+      ExcelFormat(dataAddress = "'Transactions Report'!A2:G4")
+    ) should matchExpectedDataFrame(expectedDF)
+  }
+
+}
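
For reference, a minimal end-to-end usage sketch of the new `reader-excel` module (not part of the diff above): the file path, sheet address, and local `SparkSession` setup are placeholders; the `read` API expects the session as a `given`, matching the context parameter in `ExcelToDataFrameReader.read`:

```scala
import com.clairvoyant.data.scalaxy.reader.excel.{ExcelFormat, ExcelToDataFrameReader}
import org.apache.spark.sql.SparkSession

import java.nio.file.{Files, Paths}

@main def excelReadExample(): Unit =
  // read() takes the SparkSession as a context parameter, so provide it as a given.
  given SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

  // The reader works on raw bytes, so load the workbook file into memory first.
  val bytes = Files.readAllBytes(Paths.get("/path/to/workbook.xlsx")) // placeholder path

  val df = ExcelToDataFrameReader.read(
    bytes,
    ExcelFormat(header = true, dataAddress = "'Sheet1'!A1") // placeholder sheet/address
  )

  df.show()
```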