diff --git a/README.md b/README.md index cd39298..8440f36 100644 --- a/README.md +++ b/README.md @@ -14,5 +14,6 @@ Supported text formats are: * CSV * JSON * XML +* HTML Table Please see the detailed documentation [here](text/README.md). diff --git a/build.sbt b/build.sbt index 7097091..7088a70 100644 --- a/build.sbt +++ b/build.sbt @@ -37,12 +37,15 @@ ThisBuild / wartremoverErrors ++= Warts.allBut( Wart.LeakingSealed, Wart.Null, Wart.Overloading, + Wart.Throw, + Wart.TryPartial, Wart.ToString ) // ----- TOOL VERSIONS ----- // val dataScalaxyTestUtilVersion = "1.0.0" +val jsoupVersion = "1.16.1" val scalaParserCombinatorsVersion = "2.3.0" val sparkVersion = "3.4.1" val sparkXMLVersion = "0.16.0" @@ -54,6 +57,10 @@ val dataScalaxyTestUtilDependencies = Seq( "com.clairvoyant.data.scalaxy" %% "test-util" % dataScalaxyTestUtilVersion % Test ) +val jsoupDependencies = Seq( + "org.jsoup" % "jsoup" % jsoupVersion +) + val scalaParserCombinatorsDependencies = Seq( "org.scala-lang.modules" %% "scala-parser-combinators" % scalaParserCombinatorsVersion ) @@ -77,6 +84,7 @@ val zioConfigDependencies = Seq( val textDependencies = dataScalaxyTestUtilDependencies ++ + jsoupDependencies ++ sparkDependencies ++ sparkXMLDependencies ++ zioConfigDependencies @@ -92,6 +100,6 @@ lazy val `data-scalaxy-reader` = (project in file(".")) lazy val `reader-text` = (project in file("text")) .settings( - version := "1.0.0", + version := "1.1.0", libraryDependencies ++= textDependencies ) diff --git a/text/README.md b/text/README.md index c696c7c..e800f6e 100644 --- a/text/README.md +++ b/text/README.md @@ -238,3 +238,42 @@ TextToDataFrameReader textFormat = xmlTextFormat ) `````` + +### HTML Table + +Suppose the user wants to read a text `htmlText` containing data in the form of an HTML table and parse it into a Spark dataframe. +The user then needs to perform the steps below: + +#### 1. 
Define file format + +```scala +import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat + +val htmlTableTextFormat = HTMLTableTextFormat( + tableName = Some("table#my_table") +) +``` + +The user can provide the following options to the `HTMLTableTextFormat` instance: + +| Parameter Name | Default Value | Mandatory | Description | +| :------------- | :-----------: | :-------: | :---------------------------------------------------------------------------- | +| tableName | None | No | CSS selector (passed to jsoup's `select`) identifying the table to read the data from, e.g. `table#my_table`. If omitted, the first `table` element in the text is read. | + +#### 2. Import type class instance + +```scala +import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader +`````` + +#### 3. Call API + +```scala +import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader + +TextToDataFrameReader + .read( + text = htmlText, + textFormat = htmlTableTextFormat + ) +`````` diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/formats/TextFormat.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/formats/TextFormat.scala index bfe7ffa..15fbc86 100644 --- a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/formats/TextFormat.scala +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/formats/TextFormat.scala @@ -83,3 +83,7 @@ case class XMLTextFormat( valueTag: String = "_VALUE", wildcardColName: String = "xs_any" ) extends TextFormat + +case class HTMLTableTextFormat( + tableName: Option[String] = None +) extends TextFormat diff --git a/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala new file mode 100644 index 0000000..dab7619 --- /dev/null +++ b/text/src/main/scala/com/clairvoyant/data/scalaxy/reader/text/instances/HTMLTableTextToDataFrameReader.scala @@ -0,0 +1,77 @@ +package 
/**
 * Type class instance that reads text containing an HTML table into a Spark dataframe.
 *
 * Every input text is converted to delimiter-separated text (one header line followed by one
 * line per data row, cells joined with [[VALUE_SEPARATOR]]) and then handed to
 * `TextToDataFrameReader.read` together with a `CSVTextFormat` that uses the same separator.
 */
implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameReader[HTMLTableTextFormat] {

  // Explicit Java -> Scala collection converters (`.asScala`); these replace the deprecated
  // implicit conversions from `scala.collection.convert.ImplicitConversions`.
  import scala.jdk.CollectionConverters.*

  /** Separator placed between cell values in the intermediate delimiter-separated text. */
  val VALUE_SEPARATOR: String = "~"

  /**
   * Reads HTML table texts into a dataframe.
   *
   * @param text
   *   The HTML texts; each element is converted independently with [[convertHTMLTableToCSV]].
   * @param textFormat
   *   The HTML table format; its `tableName` selects the table to read.
   * @param originalSchema
   *   Passed through unchanged to the CSV reader.
   * @param adaptSchemaColumns
   *   Passed through unchanged to the CSV reader.
   * @return
   *   The dataframe produced by the CSV reader from the converted texts.
   */
  override def read(
      text: Seq[String],
      textFormat: HTMLTableTextFormat,
      originalSchema: Option[StructType],
      adaptSchemaColumns: StructType => StructType
  )(using sparkSession: SparkSession): DataFrame =
    TextToDataFrameReader.read(
      text = text.map(htmlText => convertHTMLTableToCSV(htmlText, textFormat.tableName)),
      textFormat = CSVTextFormat(
        sep = VALUE_SEPARATOR
      ),
      originalSchema = originalSchema,
      adaptSchemaColumns = adaptSchemaColumns
    )

  /**
   * Converts one HTML table to delimiter-separated text: the header line followed by the data lines.
   *
   * Failures (for example a missing table) propagate to the caller as exceptions; nothing is
   * printed to stderr here.
   *
   * @param htmlText
   *   The HTML text to parse.
   * @param tableName
   *   Optional jsoup CSS query selecting the table; when empty, the first `table` element is used.
   * @return
   *   The header line plus one line per data row, each terminated by a newline.
   */
  def convertHTMLTableToCSV(htmlText: String, tableName: Option[String] = None): String = {
    val table = getTableFromParsedDocument(Jsoup.parse(htmlText), tableName)
    val rows = table.select("tr")

    getTableHeader(rows) + getTableRows(rows)
  }

  /**
   * Locates the table element to read.
   *
   * @param parsedDocument
   *   The parsed HTML document.
   * @param tableName
   *   When defined, the value is passed to jsoup's `select` as a CSS query and the first match is
   *   returned; when empty, the first element with tag `table` is returned.
   * @return
   *   The selected element.
   * @throws Exception
   *   If no element matches, or the document contains no `table` element at all.
   */
  def getTableFromParsedDocument(parsedDocument: Document, tableName: Option[String]): Element =
    tableName match {
      case Some(tblName) =>
        // `Elements.first()` yields null for an empty result; wrap it instead of size-checking.
        Option(parsedDocument.select(tblName).first()) match {
          case Some(table) => table
          case None        => throw new Exception(s"HTML table: $tblName not found")
        }

      case None =>
        // Without this guard a document with no table surfaced later as a NullPointerException.
        Option(parsedDocument.getElementsByTag("table").first()) match {
          case Some(table) => table
          case None        => throw new Exception("HTML table not found")
        }
    }

  /**
   * Builds the header line from the text of every `th` cell found in the given rows.
   *
   * @param rows
   *   The `tr` elements of the table.
   * @return
   *   The header cell texts joined with [[VALUE_SEPARATOR]], followed by a newline (a lone newline
   *   when there are no `th` cells).
   */
  def getTableHeader(rows: Elements): String =
    rows.select("th").asScala.map(_.text()).mkString(VALUE_SEPARATOR) + "\n"

  /**
   * Builds the data lines from the `td` cells of each row.
   *
   * Rows without any `td` cell (such as the header row) contribute nothing. Cell texts are joined
   * verbatim: a cell that itself contains [[VALUE_SEPARATOR]] is not quoted or escaped.
   * NOTE(review): such a cell presumably shifts columns in the resulting dataframe - confirm.
   *
   * @param rows
   *   The `tr` elements of the table.
   * @return
   *   One newline-terminated line per row that has `td` cells, concatenated.
   */
  def getTableRows(rows: Elements): String =
    rows.asScala
      .map(row => row.select("td").asScala.map(_.text()))
      .filter(_.nonEmpty)
      .map(cells => cells.mkString(VALUE_SEPARATOR) + "\n")
      .mkString

}
+

+ Are you monetising international traffic? You can now receive earnings in an international bank account. Click here to + learn more. +

+
+ + + + + + + + + + + + + + + + + + + + + + + +
+ Date + + Transaction + +
+ Amount +
+
+
+ Balance +
+
+ 01/06/2022 + + 04/2022 Advertising Fees + +
+ £4,049.19 +
+
+
+ £4,049.19 +
+
+ 30/05/2022 + + Payment by Direct Deposit + +
+
+ -£3,349.17 +
+
+
+
+ £0.00 +
+
+
+
+
+
+
Please note: + Any transactions that occurred prior to May 10, 2000, will not be visible on this report. +
+
+
+
+
+ """.stripMargin + + val htmlTableTextFormat = HTMLTableTextFormat() + + val actualDF = TextToDataFrameReader.read( + text = htmlText, + textFormat = htmlTableTextFormat + ) + + val expectedDF = readCSVFromText( + text = + """ + |Amount~Balance~Date~Transaction + |£4,049.19~£4,049.19~01/06/2022~04/2022 Advertising Fees + |-£3,349.17~£0.00~30/05/2022~Payment by Direct Deposit + """.stripMargin, + csvOptions = Map( + "sep" -> "~" + ) + ) + + actualDF should matchExpectedDataFrame(expectedDF) + } + +}