From 801707c9446e07e53d37661339d671ad62300cf2 Mon Sep 17 00:00:00 2001 From: rahulbhatia023 Date: Mon, 4 Dec 2023 00:56:36 -0500 Subject: [PATCH] Refactoring type class hierarchy --- build.sbt | 2 +- text/README.md | 183 ++++++++++-------- .../reader/text/TextToDataFrameReader.scala | 26 +-- .../instances/CSVTextToDataFrameReader.scala | 3 +- .../HTMLTableTextToDataFrameReader.scala | 14 +- .../instances/JSONTextToDataFrameReader.scala | 3 +- .../TextFormatToDataFrameReader.scala | 13 -- .../instances/XMLTextToDataFrameReader.scala | 3 +- .../text/CSVTextToDataFrameReaderSpec.scala | 43 ++-- .../HTMLTableTextToDataFrameReaderSpec.scala | 10 +- .../text/JSONTextToDataFrameReaderSpec.scala | 29 +-- .../text/XMLTextToDataFrameReaderSpec.scala | 22 ++- 12 files changed, 180 insertions(+), 171 deletions(-) delete mode 100644 text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/TextFormatToDataFrameReader.scala diff --git a/build.sbt b/build.sbt index 7088a70..c8337de 100644 --- a/build.sbt +++ b/build.sbt @@ -100,6 +100,6 @@ lazy val `data-scalaxy-reader` = (project in file(".")) lazy val `reader-text` = (project in file("text")) .settings( - version := "1.1.0", + version := "2.0.0", libraryDependencies ++= textDependencies ) diff --git a/text/README.md b/text/README.md index e800f6e..e28c4fb 100644 --- a/text/README.md +++ b/text/README.md @@ -12,37 +12,38 @@ ThisBuild / credentials += Credentials( System.getenv("GITHUB_TOKEN") ) -ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-text" % "1.0.0" +ThisBuild / libraryDependencies += "com.clairvoyant.data.scalaxy" %% "reader-text" % "2.0.0" ``` Make sure you add `GITHUB_USERNAME` and `GITHUB_TOKEN` to the environment variables. `GITHUB_TOKEN` is the Personal Access Token with the permission to read packages. -## API +## API -The library provides below `read` APIs in class `TextToDataFrameReader` in order to parse a text into spark dataframe: +The library provides below `read` APIs in type class `TextToDataFrameReader` in order to parse a text into spark +dataframe: ```scala - def read[T <: TextFormat]( - text: String, - textFormat: T, - originalSchema: Option[StructType] = None, - adaptSchemaColumns: StructType => StructType = identity - )(using textFormatToDataFrameReader: TextFormatToDataFrameReader[T], sparkSession: SparkSession): DataFrame - - def read[T <: TextFormat]( - text: Seq[String], - textFormat: T, - originalSchema: Option[StructType], - adaptSchemaColumns: StructType => StructType - )(using textFormatToDataFrameReader: TextFormatToDataFrameReader[T], sparkSession: SparkSession): DataFrame -`````` +def read( + text: String, + textFormat: T, + originalSchema: Option[StructType] = None, + adaptSchemaColumns: StructType => StructType = identity +)(using sparkSession: SparkSession): DataFrame + +def read( + text: Seq[String], + textFormat: T, + originalSchema: Option[StructType], + adaptSchemaColumns: StructType => StructType +)(using sparkSession: SparkSession): DataFrame +``` The `read` method takes below arguments: | Argument Name | Default Value | Description | -| :----------------- | :-----------: | :----------------------------------------------------------- | +|:-------------------|:-------------:|:-------------------------------------------------------------| | text | - | The text in string format to be parsed to dataframe. | | textFormat | - | The `TextFormat` representation for the format of the text. | | originalSchema | None | The schema for the dataframe. | @@ -60,7 +61,19 @@ Supported text formats are: Suppose user wants to read CSV text data `csvText` and parse it to spark dataframe. Then user need to perform below steps: -#### 1. Define file format +#### 1. Import type class + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.reader.text.instances.CSVTextToDataFrameReader +``` + +#### 3. Define text format ```scala import com.clairvoyant.data.scalaxy.reader.text.formats.CSVTextFormat @@ -73,7 +86,7 @@ val csvTextFormat = CSVTextFormat( User can provide below options to the `CSVTextFormat` instance: | Parameter Name | Default Value | Description | -| :---------------------------- | :-------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:------------------------------|:---------------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | charToEscapeQuoteEscaping | \ | Sets a single character used for escaping the escape for the quote character. | | columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode.
This overrides `spark.sql.columnNameOfCorruptRecord`. | | comment | # | Sets a single character used for skipping lines beginning with this character. | @@ -106,30 +119,34 @@ User can provide below options to the `CSVTextFormat` instance: | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | | unescapedQuoteHandling | STOP_AT_DELIMITER | Defines how the CsvParser will handle values with unescaped quotes.
Allowed values are STOP_AT_CLOSING_QUOTE, BACK_TO_DELIMITER, STOP_AT_DELIMITER, SKIP_VALUE, RAISE_ERROR | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.reader.text.instances.CSVTextToDataFrameReader -`````` +TextToDataFrameReader[CSVTextFormat] + .read( + text = csvText, + textFormat = csvTextFormat + ) +``` + +### JSON + +Suppose user wants to read JSON text data `jsonText` and parse it to spark dataframe. +Then user need to perform below steps: -#### 3. Call API +#### 1. Import type class ```scala import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` -TextToDataFrameReader - .read( - text = csvText, - textFormat = csvTextFormat - ) -`````` - -### JSON +#### 2. Import type class instance -Suppose user wants to read JSON text data `jsonText` and parse it to spark dataframe. -Then user need to perform below steps: +```scala +import com.clairvoyant.data.scalaxy.reader.text.instances.JSONTextToDataFrameReader +``` -#### 1. Define file format +#### 3. Define text format ```scala import com.clairvoyant.data.scalaxy.reader.text.formats.JSONTextFormat @@ -142,7 +159,7 @@ val jsonTextFormat = JSONTextFormat( User can provide below options to the `JSONTextFormat` instance: | Parameter Name | Default Value | Description | -| :--------------------------------- | :-------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:-----------------------------------|:---------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------| | allowBackslashEscapingAnyCharacter | false | Allows accepting quoting of all character using backslash quoting mechanism. | | allowComments | false | Ignores Java/C++ style comment in JSON records. | | allowNonNumericNumbers | true | Allows JSON parser to recognize set of “Not-a-Number” (NaN) tokens as legal floating number values. | @@ -168,30 +185,34 @@ User can provide below options to the `JSONTextFormat` instance: | timestampNTZFormat | yyyy-MM-dd'T'HH:mm:ss[.SSS] | Sets the string that indicates a timestamp without timezone format. | | timeZone | UTC | Sets the string that indicates a time zone ID to be used to format timestamps in the JSON datasources or partition values. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.reader.text.instances.JSONTextToDataFrameReader -`````` +TextToDataFrameReader[JSONTextFormat] + .read( + text = jsonText, + textFormat = jsonTextFormat + ) +``` -#### 3. Call API +### XML + +Suppose user wants to read XML text data `xmlText` and parse it to spark dataframe. +Then user need to perform below steps: + +#### 1. Import type class ```scala import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` -TextToDataFrameReader - .read( - text = jsonText, - textFormat = jsonTextFormat - ) -`````` - -### XML +#### 2. Import type class instance -Suppose user wants to read XML text data `xmlText` and parse it to spark dataframe. -Then user need to perform below steps: +```scala +import com.clairvoyant.data.scalaxy.reader.text.instances.XMLTextToDataFrameReader +``` -#### 1. Define file format +#### 3. Define text format ```scala import com.clairvoyant.data.scalaxy.reader.text.formats.XMLTextFormat @@ -204,7 +225,7 @@ val xmlTextFormat = XMLTextFormat( User can provide below options to the `XMLTextFormat` instance: | Parameter Name | Default Value | Description | -| :------------------------ | :-----------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +|:--------------------------|:-------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | attributePrefix | _ | The prefix for attributes so that we can differentiate attributes and elements. | | charset | UTF-8 | Defaults to 'UTF-8' but can be set to other valid charset names. | | columnNameOfCorruptRecord | _corrupt_record | Allows renaming the new field having malformed string created by PERMISSIVE mode. This overrides spark.sql.columnNameOfCorruptRecord. | @@ -221,30 +242,34 @@ User can provide below options to the `XMLTextFormat` instance: | valueTag | _VALUE | The tag used for the value when there are attributes in the element having no child. | | wildcardColName | xs_any | Name of a column existing in the provided schema which is interpreted as a 'wildcard'. It must have type string or array of strings.
It will match any XML child element that is not otherwise matched by the schema. The XML of the child becomes the string value of the column.
If an array, then all unmatched elements will be returned as an array of strings. As its name implies, it is meant to emulate XSD's xs:any type. | -#### 2. Import type class instance +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.reader.text.instances.XMLTextToDataFrameReader -`````` +TextToDataFrameReader[XMLTextFormat] + .read( + text = xmlText, + textFormat = xmlTextFormat + ) +``` + +### HTML Table + +Suppose user wants to read a text `htmlText` containing data in the form of html table and parse it to spark dataframe. +Then user need to perform below steps: -#### 3. Call API +#### 1. Import type class ```scala import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader +``` -TextToDataFrameReader - .read( - text = xmlText, - textFormat = xmlTextFormat - ) -`````` - -### HTML Table +#### 2. Import type class instance -Suppose user wants to read a text `htmlText` containing data in the form of html table and parse it to spark dataframe. -Then user need to perform below steps: +```scala +import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader +``` -#### 1. Define file format +#### 3. Define text format ```scala import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat @@ -257,23 +282,15 @@ val htmlTableTextFormat = HTMLTableTextFormat( User can provide below options to the `HTMLTableTextFormat` instance: | Parameter Name | Default Value | Mandatory | Description | -| :------------- | :-----------: | :-------: | :---------------------------------------------------------------------------- | +|:---------------|:-------------:|:---------:|:------------------------------------------------------------------------------| | tableName | None | No | The name of the table in the `table` tag that you want to read the data from. | -#### 2. Import type class instance - -```scala -import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader -`````` - -#### 3. Call API +#### 4. Call API ```scala -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader - -TextToDataFrameReader - .read( - text = htmlText, - textFormat = htmlTableTextFormat - ) -`````` +TextToDataFrameReader[HTMLTableTextFormat] + .read( + text = htmlText, + textFormat = htmlTableTextFormat + ) +``` diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/TextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/TextToDataFrameReader.scala index 1ac62fa..a1dda5f 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/TextToDataFrameReader.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/TextToDataFrameReader.scala @@ -1,26 +1,26 @@ package com.clairvoyant.data.scalaxy.reader.text import com.clairvoyant.data.scalaxy.reader.text.formats.TextFormat -import com.clairvoyant.data.scalaxy.reader.text.instances.TextFormatToDataFrameReader import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} -object TextToDataFrameReader { +trait TextToDataFrameReader[T]: - def read[T <: TextFormat]( + def read( + text: Seq[String], + textFormat: T, + originalSchema: Option[StructType], + adaptSchemaColumns: StructType => StructType + )(using sparkSession: SparkSession): DataFrame + + def read( text: String, textFormat: T, originalSchema: Option[StructType] = None, adaptSchemaColumns: StructType => StructType = identity - )(using textFormatToDataFrameReader: TextFormatToDataFrameReader[T], sparkSession: SparkSession): DataFrame = - read(Seq(text), textFormat, originalSchema, adaptSchemaColumns) + )(using sparkSession: SparkSession): DataFrame = read(Seq(text), textFormat, originalSchema, adaptSchemaColumns) - def read[T <: TextFormat]( - text: Seq[String], - textFormat: T, - originalSchema: Option[StructType], - adaptSchemaColumns: StructType => StructType - )(using textFormatToDataFrameReader: TextFormatToDataFrameReader[T], sparkSession: SparkSession): DataFrame = - textFormatToDataFrameReader.read(text, textFormat, originalSchema, adaptSchemaColumns) +object TextToDataFrameReader: -} + def apply[T <: TextFormat](using textToDataFrameReader: TextToDataFrameReader[T]): TextToDataFrameReader[T] = + textToDataFrameReader diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/CSVTextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/CSVTextToDataFrameReader.scala index b041502..06433e2 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/CSVTextToDataFrameReader.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/CSVTextToDataFrameReader.scala @@ -1,11 +1,12 @@ package com.clairvoyant.data.scalaxy.reader.text.instances +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader import com.clairvoyant.data.scalaxy.reader.text.formats.CSVTextFormat import org.apache.spark.sql.catalyst.csv.CSVOptions.* import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} -implicit object CSVTextToDataFrameReader extends TextFormatToDataFrameReader[CSVTextFormat] { +implicit object CSVTextToDataFrameReader extends TextToDataFrameReader[CSVTextFormat] { override def read( text: Seq[String], diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala index dab7619..f9cbe61 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala @@ -11,9 +11,9 @@ import org.jsoup.select.Elements import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` import scala.util.Try -implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameReader[HTMLTableTextFormat] { +implicit object HTMLTableTextToDataFrameReader extends TextToDataFrameReader[HTMLTableTextFormat] { - val VALUE_SEPARATOR = "~" + private val VALUE_SEPARATOR = "~" override def read( text: Seq[String], @@ -21,7 +21,7 @@ implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameRead originalSchema: Option[StructType], adaptSchemaColumns: StructType => StructType )(using sparkSession: SparkSession): DataFrame = - TextToDataFrameReader.read( + TextToDataFrameReader[CSVTextFormat].read( text = text.map(htmlText => convertHTMLTableToCSV(htmlText, textFormat.tableName)), textFormat = CSVTextFormat( sep = VALUE_SEPARATOR @@ -30,7 +30,7 @@ implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameRead adaptSchemaColumns = adaptSchemaColumns ) - def convertHTMLTableToCSV(htmlText: String, tableName: Option[String] = None): String = { + private def convertHTMLTableToCSV(htmlText: String, tableName: Option[String] = None): String = { Try { val parsedDocument = Jsoup.parse(htmlText) @@ -47,7 +47,7 @@ implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameRead }.get } - def getTableFromParsedDocument(parsedDocument: Document, tableName: Option[String]): Element = + private def getTableFromParsedDocument(parsedDocument: Document, tableName: Option[String]): Element = tableName .map { tblName => val tables = parsedDocument.select(tblName) @@ -58,12 +58,12 @@ implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameRead } .getOrElse(parsedDocument.getElementsByTag("table").first()) - def getTableHeader(rows: Elements): String = + private def getTableHeader(rows: Elements): String = Seq(rows.select("th").map(_.text)).flatten .mkString(VALUE_SEPARATOR) .concat("\n") - def getTableRows(rows: Elements): String = + private def getTableRows(rows: Elements): String = Seq( rows.map { row => val flattenRows = Seq(row.select("td").map(_.text())).flatten diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/JSONTextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/JSONTextToDataFrameReader.scala index e2459d3..3d082cc 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/JSONTextToDataFrameReader.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/JSONTextToDataFrameReader.scala @@ -1,12 +1,13 @@ package com.clairvoyant.data.scalaxy.reader.text.instances +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader import com.clairvoyant.data.scalaxy.reader.text.formats.JSONTextFormat import org.apache.spark.sql.catalyst.json.JSONOptions.* import org.apache.spark.sql.functions.{col, explode} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} -implicit object JSONTextToDataFrameReader extends TextFormatToDataFrameReader[JSONTextFormat] { +implicit object JSONTextToDataFrameReader extends TextToDataFrameReader[JSONTextFormat] { override def read( text: Seq[String], diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/TextFormatToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/TextFormatToDataFrameReader.scala deleted file mode 100644 index 4426a2c..0000000 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/TextFormatToDataFrameReader.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.clairvoyant.data.scalaxy.reader.text.instances - -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{DataFrame, SparkSession} - -trait TextFormatToDataFrameReader[T]: - - def read( - text: Seq[String], - textFormat: T, - originalSchema: Option[StructType], - adaptSchemaColumns: StructType => StructType - )(using sparkSession: SparkSession): DataFrame diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/XMLTextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/XMLTextToDataFrameReader.scala index e77d267..475713f 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/XMLTextToDataFrameReader.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/XMLTextToDataFrameReader.scala @@ -1,5 +1,6 @@ package com.clairvoyant.data.scalaxy.reader.text.instances +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader import com.clairvoyant.data.scalaxy.reader.text.formats.XMLTextFormat import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} @@ -7,7 +8,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import java.io.{File, PrintWriter} import java.util.concurrent.Semaphore -implicit object XMLTextToDataFrameReader extends TextFormatToDataFrameReader[XMLTextFormat] { +implicit object XMLTextToDataFrameReader extends TextToDataFrameReader[XMLTextFormat] { override def read( text: Seq[String], diff --git a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/CSVTextToDataFrameReaderSpec.scala b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/CSVTextToDataFrameReaderSpec.scala index fc0a73c..582e23b 100644 --- a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/CSVTextToDataFrameReaderSpec.scala +++ b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/CSVTextToDataFrameReaderSpec.scala @@ -1,6 +1,5 @@ package com.clairvoyant.data.scalaxy.reader.text -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader import com.clairvoyant.data.scalaxy.reader.text.formats.CSVTextFormat import com.clairvoyant.data.scalaxy.reader.text.instances.CSVTextToDataFrameReader import com.clairvoyant.data.scalaxy.test.util.SparkUtil @@ -9,6 +8,8 @@ import org.apache.spark.sql.types.* class CSVTextToDataFrameReaderSpec extends SparkUtil { + val csvTextToDataFrameReader = TextToDataFrameReader[CSVTextFormat] + "read() - with csv text and headers" should "return a dataframe with correct count and schema" in { val csvText = """|col1,col2 @@ -18,7 +19,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { val csvTextFormat = CSVTextFormat() - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -38,7 +39,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { inferSchema = false ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -57,7 +58,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { header = false ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -77,7 +78,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { sep = ";" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -94,7 +95,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { recordSep = "#" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -112,7 +113,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { recordSep = "#" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -132,7 +133,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { timestampFormat = "dd-MM-yyyy HH:mm:ss" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -153,7 +154,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { locale = "fr-FR" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -175,7 +176,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { mode = "PERMISSIVE" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -208,7 +209,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { escape = "#" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -235,7 +236,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { nullValue = "Invalid" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -263,7 +264,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { quote = "~" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -285,7 +286,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { enforceSchema = true ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat, originalSchema = Some( @@ -315,7 +316,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { ignoreLeadingWhiteSpace = true ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -343,7 +344,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { ignoreTrailingWhiteSpace = true ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -372,7 +373,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { nanValue = "NA" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -392,7 +393,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { positiveInf = "PositiveInfiniteValue" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -417,7 +418,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { negativeInf = "NegativeInfiniteValue" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -444,7 +445,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { comment = "*" ) - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat ) @@ -462,7 +463,7 @@ class CSVTextToDataFrameReaderSpec extends SparkUtil { val csvTextFormat = CSVTextFormat() - val df = TextToDataFrameReader.read( + val df = csvTextToDataFrameReader.read( text = csvText, textFormat = csvTextFormat, adaptSchemaColumns = diff --git a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/HTMLTableTextToDataFrameReaderSpec.scala b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/HTMLTableTextToDataFrameReaderSpec.scala index 9950833..d537248 100644 --- a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/HTMLTableTextToDataFrameReaderSpec.scala +++ b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/HTMLTableTextToDataFrameReaderSpec.scala @@ -1,16 +1,14 @@ package com.clairvoyant.data.scalaxy.reader.text -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader -import com.clairvoyant.data.scalaxy.reader.text.formats.{CSVTextFormat, HTMLTableTextFormat} +import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader -import com.clairvoyant.data.scalaxy.test.util.SparkUtil import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.* class HTMLTableTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher { + val htmlTableTextToDataFrameReader = TextToDataFrameReader[HTMLTableTextFormat] + "read() - with html text" should "return a dataframe with correct count and schema" in { val htmlText = """ @@ -98,7 +96,7 @@ class HTMLTableTextToDataFrameReaderSpec extends DataFrameReader with DataFrameM val htmlTableTextFormat = HTMLTableTextFormat() - val actualDF = TextToDataFrameReader.read( + val actualDF = htmlTableTextToDataFrameReader.read( text = htmlText, textFormat = htmlTableTextFormat ) diff --git a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/JSONTextToDataFrameReaderSpec.scala b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/JSONTextToDataFrameReaderSpec.scala index a41eea5..0caedcb 100644 --- a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/JSONTextToDataFrameReaderSpec.scala +++ b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/JSONTextToDataFrameReaderSpec.scala @@ -1,6 +1,5 @@ package com.clairvoyant.data.scalaxy.reader.text -import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader import com.clairvoyant.data.scalaxy.reader.text.formats.JSONTextFormat import com.clairvoyant.data.scalaxy.reader.text.instances.JSONTextToDataFrameReader import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher @@ -9,6 +8,8 @@ import org.apache.spark.sql.types.* class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher { + val jsonTextToDataFrameReader = TextToDataFrameReader[JSONTextFormat] + "read() - with json text having array of records" should "return a dataframe with correct count and schema" in { val jsonText = """|[ @@ -24,7 +25,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche val jsonTextFormat = JSONTextFormat() - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -42,7 +43,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche val jsonTextFormat = JSONTextFormat() - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -56,7 +57,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche val jsonTextFormat = JSONTextFormat() - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -77,7 +78,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowBackslashEscapingAnyCharacter = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -113,7 +114,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowComments = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -148,7 +149,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowNonNumericNumbers = true ) - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -168,7 +169,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowNumericLeadingZeros = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -196,7 +197,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowSingleQuotes = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -224,7 +225,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche allowUnquotedFieldNames = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -259,7 +260,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche dropFieldIfAllNull = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -292,7 +293,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche prefersDecimal = true ) - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -312,7 +313,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche primitivesAsString = true ) - val df = TextToDataFrameReader.read( + val df = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat ) @@ -330,7 +331,7 @@ class JSONTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatche val jsonTextFormat = JSONTextFormat() - val actualDF = TextToDataFrameReader.read( + val actualDF = jsonTextToDataFrameReader.read( text = jsonText, textFormat = jsonTextFormat, adaptSchemaColumns = diff --git a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/XMLTextToDataFrameReaderSpec.scala b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/XMLTextToDataFrameReaderSpec.scala index 952bf0c..ffb2d6d 100644 --- a/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/XMLTextToDataFrameReaderSpec.scala +++ b/text/src/test/scala/com/clairvoyant/data/scalaxy/reader/text/XMLTextToDataFrameReaderSpec.scala @@ -8,6 +8,8 @@ import org.apache.spark.sql.types.* class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher { + val xmlTextToDataFrameReader = TextToDataFrameReader[XMLTextFormat] + "read() - with xml text" should "return a dataframe with correct count and schema" in { val xmlText = """| @@ -19,7 +21,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher val xmlTextFormat = XMLTextFormat() - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -41,7 +43,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher attributePrefix = "attr_" ) - val actualDF = TextToDataFrameReader.read( + val actualDF = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -89,7 +91,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher rowTag = "ROW" ) - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -127,7 +129,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher excludeAttribute = true ) - val actualDF = TextToDataFrameReader.read( + val actualDF = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -157,7 +159,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher nullValue = "" ) - val actualDF = TextToDataFrameReader.read( + val actualDF = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -187,7 +189,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher rowTag = "cols" ) - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -225,7 +227,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher rowTag = "person" ) - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -261,7 +263,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher rowTag = "book" ) - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -287,7 +289,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher valueTag = "#VALUE" ) - val actualDF = TextToDataFrameReader.read( + val actualDF = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat ) @@ -320,7 +322,7 @@ class XMLTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher val xmlTextFormat = XMLTextFormat() - val df = TextToDataFrameReader.read( + val df = xmlTextToDataFrameReader.read( text = xmlText, textFormat = xmlTextFormat, adaptSchemaColumns =