Skip to content

Commit

Permalink
Merge pull request #6 from teamclairvoyant/staging
Browse files Browse the repository at this point in the history
Staging
  • Loading branch information
rahulbhatia023 authored Oct 16, 2023
2 parents 4fd177e + 35d01e7 commit 1936c2c
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ Supported text formats are:
* CSV
* JSON
* XML
* HTML Table

Please see the detailed documentation [here](text/README.md).
10 changes: 9 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@ ThisBuild / wartremoverErrors ++= Warts.allBut(
Wart.LeakingSealed,
Wart.Null,
Wart.Overloading,
Wart.Throw,
Wart.TryPartial,
Wart.ToString
)

// ----- TOOL VERSIONS ----- //

// Versions of the libraries referenced by the dependency definitions below.
val dataScalaxyTestUtilVersion = "1.0.0"
// jsoup is the HTML parser used by the HTML table text format reader.
val jsoupVersion = "1.16.1"
val scalaParserCombinatorsVersion = "2.3.0"
val sparkVersion = "3.4.1"
val sparkXMLVersion = "0.16.0"
Expand All @@ -54,6 +57,10 @@ val dataScalaxyTestUtilDependencies = Seq(
"com.clairvoyant.data.scalaxy" %% "test-util" % dataScalaxyTestUtilVersion % Test
)

// jsoup HTML parser. It is a plain Java artifact, hence the single `%`
// (no Scala binary-version suffix is appended to the artifact name).
val jsoupDependencies = Seq(
"org.jsoup" % "jsoup" % jsoupVersion
)

val scalaParserCombinatorsDependencies = Seq(
"org.scala-lang.modules" %% "scala-parser-combinators" % scalaParserCombinatorsVersion
)
Expand All @@ -77,6 +84,7 @@ val zioConfigDependencies = Seq(

// All library dependencies of the `reader-text` module.
val textDependencies =
dataScalaxyTestUtilDependencies ++
jsoupDependencies ++
sparkDependencies ++
sparkXMLDependencies ++
zioConfigDependencies
Expand All @@ -92,6 +100,6 @@ lazy val `data-scalaxy-reader` = (project in file("."))

lazy val `reader-text` = (project in file("text"))
.settings(
version := "1.0.0",
version := "1.1.0",
libraryDependencies ++= textDependencies
)
39 changes: 39 additions & 0 deletions text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,42 @@ TextToDataFrameReader
textFormat = xmlTextFormat
)
``````

### HTML Table

Suppose the user wants to read a text `htmlText` containing data in the form of an HTML table and parse it into a Spark dataframe.
Then the user needs to perform the steps below:

#### 1. Define file format

```scala
import com.clairvoyant.data.scalaxy.reader.text.formats.HTMLTableTextFormat

val htmlTableTextFormat = HTMLTableTextFormat(
tableName = "my_table"
)
```

The user can provide the following options to the `HTMLTableTextFormat` instance:

| Parameter Name | Default Value | Mandatory | Description |
| :------------- | :-----------: | :-------: | :---------------------------------------------------------------------------- |
| tableName | None | No | CSS selector (as accepted by jsoup's `select`, e.g. `#my_table` or `table.report`) of the table to read the data from. If not set, the first `<table>` in the text is used. |

#### 2. Import type class instance

```scala
import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader
```

#### 3. Call API

```scala
import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader

TextToDataFrameReader
.read(
text = htmlText,
textFormat = htmlTableTextFormat
)
```
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,7 @@ case class XMLTextFormat(
valueTag: String = "_VALUE",
wildcardColName: String = "xs_any"
) extends TextFormat

/**
 * Text format describing text whose data is held in an HTML table.
 *
 * @param tableName
 *   Optional identifier of the table to read; defaults to `None`.
 */
case class HTMLTableTextFormat(tableName: Option[String] = None) extends TextFormat
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.clairvoyant.data.scalaxy.reader.text.instances

import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader
import com.clairvoyant.data.scalaxy.reader.text.formats.{CSVTextFormat, HTMLTableTextFormat}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element}
import org.jsoup.select.Elements

import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable`
import scala.util.Try

/**
 * Type class instance that reads text containing an HTML table into a Spark dataframe.
 *
 * The selected table is flattened into delimiter-separated text (one header line followed by one line per
 * data row) and that text is then delegated to the CSV reader.
 */
implicit object HTMLTableTextToDataFrameReader extends TextFormatToDataFrameReader[HTMLTableTextFormat] {

  /**
   * Delimiter placed between cell values in the intermediate CSV text.
   *
   * NOTE(review): cell values are not quoted or escaped, so a cell whose text contains this character would
   * presumably be split into extra columns by the CSV reader - confirm whether that matters for callers.
   */
  val VALUE_SEPARATOR: String = "~"

  /**
   * Converts every HTML text to CSV text and reads the result through the CSV text format.
   *
   * @param text
   *   HTML texts, each expected to contain the table to read
   * @param textFormat
   *   HTML table format; its `tableName` selects the table inside each text
   * @param originalSchema
   *   passed through unchanged to the CSV read
   * @param adaptSchemaColumns
   *   passed through unchanged to the CSV read
   * @return
   *   dataframe produced by the CSV read of the converted text
   */
  override def read(
      text: Seq[String],
      textFormat: HTMLTableTextFormat,
      originalSchema: Option[StructType],
      adaptSchemaColumns: StructType => StructType
  )(using sparkSession: SparkSession): DataFrame =
    TextToDataFrameReader.read(
      text = text.map(htmlText => convertHTMLTableToCSV(htmlText, textFormat.tableName)),
      textFormat = CSVTextFormat(
        sep = VALUE_SEPARATOR
      ),
      originalSchema = originalSchema,
      adaptSchemaColumns = adaptSchemaColumns
    )

  /**
   * Parses `htmlText` and renders the selected table as CSV text using [[VALUE_SEPARATOR]].
   *
   * @param htmlText
   *   HTML document or fragment containing the table
   * @param tableName
   *   optional jsoup CSS selector of the table; when `None` the first `table` element is used
   * @return
   *   the header line followed by the data lines, each terminated by a newline
   * @throws Exception
   *   if no matching table exists; any failure is printed to stderr and rethrown
   */
  def convertHTMLTableToCSV(htmlText: String, tableName: Option[String] = None): String = {
    Try {
      val parsedDocument = Jsoup.parse(htmlText)
      val table = getTableFromParsedDocument(parsedDocument, tableName)

      // Every <tr> below the table is considered, header row and data rows alike.
      val rows = table.select("tr")

      getTableHeader(rows).concat(getTableRows(rows))
    }.recover { case ex: Exception =>
      // Kept from the original behaviour: print the failure, then let it propagate to the caller.
      ex.printStackTrace()
      throw ex
    }.get
  }

  /**
   * Locates the table element to read.
   *
   * @param parsedDocument
   *   parsed HTML document
   * @param tableName
   *   optional CSS selector handed to jsoup's `select`; when `None` the first `table` tag is used
   * @return
   *   the first matching element
   * @throws Exception
   *   if nothing matches the selector, or if the document contains no `table` element at all
   */
  def getTableFromParsedDocument(parsedDocument: Document, tableName: Option[String]): Element =
    tableName
      .map { tblName =>
        firstElementOrFail(parsedDocument.select(tblName), s"HTML table: $tblName not found")
      }
      .getOrElse(
        // `Elements.first()` yields null on an empty result, which previously surfaced later as a bare
        // NullPointerException; fail here with a descriptive message instead.
        firstElementOrFail(parsedDocument.getElementsByTag("table"), "No HTML table found in the provided text")
      )

  /** Returns the first element of `elements`, or throws an exception carrying `errorMessage` when it is empty. */
  private def firstElementOrFail(elements: Elements, errorMessage: String): Element =
    if (elements.isEmpty())
      throw new Exception(errorMessage)
    else
      elements.first()

  /**
   * Builds the CSV header line from the text of all `th` cells found in `rows`.
   *
   * @return
   *   the header values joined by [[VALUE_SEPARATOR]], followed by a newline
   */
  def getTableHeader(rows: Elements): String =
    rows
      .select("th")
      .map(_.text())
      .mkString(VALUE_SEPARATOR)
      .concat("\n")

  /**
   * Builds the CSV data lines from the `td` cells of each row.
   *
   * Rows without any `td` cell (for example the header row) contribute nothing to the output.
   *
   * @return
   *   one newline-terminated line per data row, concatenated into a single string
   */
  def getTableRows(rows: Elements): String =
    rows
      .map(row => row.select("td").map(_.text()))
      .filter(_.nonEmpty)
      .map(cells => cells.mkString(VALUE_SEPARATOR).concat("\n"))
      .mkString

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package com.clairvoyant.data.scalaxy.reader.text

import com.clairvoyant.data.scalaxy.reader.text.TextToDataFrameReader
import com.clairvoyant.data.scalaxy.reader.text.formats.{CSVTextFormat, HTMLTableTextFormat}
import com.clairvoyant.data.scalaxy.reader.text.instances.HTMLTableTextToDataFrameReader
import com.clairvoyant.data.scalaxy.test.util.SparkUtil
import com.clairvoyant.data.scalaxy.test.util.matchers.DataFrameMatcher
import com.clairvoyant.data.scalaxy.test.util.readers.DataFrameReader
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.*

// Spec for reading text that contains an HTML table into a dataframe via TextToDataFrameReader.
class HTMLTableTextToDataFrameReaderSpec extends DataFrameReader with DataFrameMatcher {

// Reads an HTML fragment holding a single table (four header cells, two data rows) using the
// default HTMLTableTextFormat and compares the result with a dataframe built from CSV text.
"read() - with html text" should "return a dataframe with correct count and schema" in {
// Fixture: the table is surrounded by unrelated markup (promo paragraph, alert box) and
// several cells wrap their text in nested <div> elements.
val htmlText =
"""
<!DOCTYPE html>
<div class="ac-card-content ac-card-content-normal ac-card-content-normal-alt">
<p class="a-spacing-base ac-cms-promo-big"><img alt="" src="https://images-eu.ssl-images-amazon.com/images/G/02/associates/network/holborn/ads-promo._V285961827_.png">
Are you monetising international traffic? You can now receive earnings in an international bank account. <a href="https://amazon-affiliate.eu/en/receive-international-earnings/" target="_blank">Click here</a> to
learn more.
</p>
<div class="a-dtt-datatable">
<table class="a-dtt-table">
<thead class="a-dtt-thead">
<tr>
<th class="a-dtt-header">
Date
</th>
<th class="a-dtt-header">
Transaction
</th>
<th class="a-dtt-header">
<div class="ac-float-r">
Amount
</div>
</th>
<th class="a-dtt-header">
<div class="ac-float-r">
Balance
</div>
</th>
</tr>
</thead>
<tbody class="a-dtt-tbody">
<tr>
<td>
01/06/2022
</td>
<td>
04/2022 Advertising Fees
</td>
<td>
<div class="ac-float-r">
£4,049.19
</div>
</td>
<td>
<div class="ac-float-r">
£4,049.19
</div>
</td>
</tr>
<tr>
<td>
30/05/2022
</td>
<td>
Payment by Direct Deposit
</td>
<td>
<div class="ac-float-r">
<div class="ac-payment-balance-negative">
-£3,349.17
</div>
</div>
</td>
<td>
<div class="ac-float-r">
£0.00
</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="ac-payment-note ac-standard-alert">
<div class="a-box a-alert a-alert-info" aria-live="polite" aria-atomic="true">
<div class="a-box-inner a-alert-container"><i class="a-icon a-icon-alert"></i>
<div class="a-alert-content">Please note:
Any transactions that occurred prior to May 10, 2000, will not be visible on this report.
</div>
</div>
</div>
</div>
</div>
""".stripMargin

// No tableName is supplied, so the format's default (None) is used.
val htmlTableTextFormat = HTMLTableTextFormat()

val actualDF = TextToDataFrameReader.read(
text = htmlText,
textFormat = htmlTableTextFormat
)

// Expected data expressed as "~"-separated CSV text.
// NOTE(review): the columns are listed here in a different order (Amount, Balance, Date, Transaction)
// than in the HTML header (Date, Transaction, Amount, Balance); presumably the matcher compares
// dataframes independently of column order - confirm against DataFrameMatcher.
val expectedDF = readCSVFromText(
text =
"""
|Amount~Balance~Date~Transaction
|£4,049.19~£4,049.19~01/06/2022~04/2022 Advertising Fees
|-£3,349.17~£0.00~30/05/2022~Payment by Direct Deposit
""".stripMargin,
csvOptions = Map(
"sep" -> "~"
)
)

actualDF should matchExpectedDataFrame(expectedDF)
}

}

0 comments on commit 1936c2c

Please sign in to comment.