Merge branch 'main' into multi-api-engine

elronbandel authored Nov 18, 2024
2 parents b686f95 + 06af796 commit b4dfe3b
Showing 8 changed files with 112 additions and 6 deletions.
33 changes: 33 additions & 0 deletions prepare/serializers/table_serializers.py
@@ -0,0 +1,33 @@
from unitxt import add_to_catalog
from unitxt.struct_data_operators import (
SerializeTableAsConcatenation,
SerializeTableAsDFLoader,
SerializeTableAsHTML,
SerializeTableAsIndexedRowMajor,
SerializeTableAsJson,
SerializeTableAsMarkdown,
)

operator = SerializeTableAsConcatenation()

add_to_catalog(operator, "serializers.table.concat", overwrite=True)

operator = SerializeTableAsIndexedRowMajor()

add_to_catalog(operator, "serializers.table.indexed_row_major", overwrite=True)

operator = SerializeTableAsMarkdown()

add_to_catalog(operator, "serializers.table.markdown", overwrite=True)

operator = SerializeTableAsDFLoader()

add_to_catalog(operator, "serializers.table.df", overwrite=True)

operator = SerializeTableAsJson()

add_to_catalog(operator, "serializers.table.json", overwrite=True)

operator = SerializeTableAsHTML()

add_to_catalog(operator, "serializers.table.html", overwrite=True)
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/concat.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_concatenation"
}
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/df.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_df_loader"
}
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/html.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_html"
}
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/indexed_row_major.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_indexed_row_major"
}
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/json.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_json"
}
3 changes: 3 additions & 0 deletions src/unitxt/catalog/serializers/table/markdown.json
@@ -0,0 +1,3 @@
{
"__type__": "serialize_table_as_markdown"
}
67 changes: 61 additions & 6 deletions src/unitxt/struct_data_operators.py
@@ -312,6 +312,32 @@ def process_rows(self, rows: List[List]) -> str:
        return rows_html


class SerializeTableAsConcatenation(SerializeTable):
    """Concat serializer.

    Concatenates all table content into one string of header and rows.
    Sample format:
    name age Alex 26 Diana 34
    """

    def serialize_table(self, table_content: Dict) -> str:
        # Extract headers and rows from the dictionary
        header = table_content["header"]
        rows = table_content["rows"]

        assert header and rows, "Incorrect input table format"

        # Process the table header first
        serialized_tbl_str = " ".join([str(i) for i in header])

        # Append each row, space-separated, after the header
        for row in rows:
            serialized_tbl_str += " " + " ".join([str(i) for i in row])

        # Return the serialized table as a single string
        return serialized_tbl_str.strip()
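
As a quick illustration of the concatenation format, a minimal snippet like the following (the table literal is invented for the example; only the serialize_table method shown above is exercised) should reproduce the sample string from the docstring:

from unitxt.struct_data_operators import SerializeTableAsConcatenation

# A toy table in the {"header": [...], "rows": [...]} layout used above.
table = {"header": ["name", "age"], "rows": [["Alex", 26], ["Diana", 34]]}

serializer = SerializeTableAsConcatenation()
# Per the docstring, this should print: name age Alex 26 Diana 34
print(serializer.serialize_table(table))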


# truncate cell value to maximum allowed length
def truncate_cell(cell_value, max_len):
    if cell_value is None:
@@ -566,7 +592,7 @@ def replace_header(self, table_content: Dict) -> str:
        return table_content


class ShuffleTableRows(FieldOperator):
class ShuffleTableRows(TypeDependentAugmentor):
    """Shuffles the input table rows randomly.
    Sample Input:
@@ -582,12 +608,15 @@ class ShuffleTableRows(FieldOperator):
    }
    """

    augmented_type = Table
    seed = 0

    def process_value(self, table: Any) -> Any:
        table_input = recursive_copy(table)
        return shuffle_rows(table_input)
        return shuffle_rows(table_input, self.seed)


class ShuffleTableColumns(FieldOperator):
class ShuffleTableColumns(TypeDependentAugmentor):
    """Shuffles the table columns randomly.
    Sample Input:
@@ -603,9 +632,12 @@ class ShuffleTableColumns(FieldOperator):
    }
    """

    augmented_type = Table
    seed = 0

    def process_value(self, table: Any) -> Any:
        table_input = recursive_copy(table)
        return shuffle_columns(table_input)
        return shuffle_columns(table_input, self.seed)
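
The shuffle_rows and shuffle_columns helpers that now accept a seed are not shown in this diff. A minimal sketch of what a seed-aware shuffle_rows could look like (the implementation below is an assumption for illustration, not the library's actual code):

import random
from typing import Dict


def shuffle_rows(table: Dict, seed: int = 0) -> Dict:
    # Reorder the rows with a locally seeded RNG so the augmentation is
    # reproducible for a fixed seed and does not touch the global random state.
    rows = list(table["rows"])
    random.Random(seed).shuffle(rows)
    return {"header": table["header"], "rows": rows}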


class LoadJson(FieldOperator):
@@ -640,9 +672,9 @@ class MapHTMLTableToJSON(FieldOperator):
    _requirements_list = ["bs4"]

    def process_value(self, table: Any) -> Any:
        return self.truncate_table_rows(table_content=table)
        return self.convert_to_json(table_content=table)

    def truncate_table_rows(self, table_content: str) -> Dict:
    def convert_to_json(self, table_content: str) -> Dict:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(table_content, "html.parser")
@@ -856,3 +888,26 @@ def process_value(self, table: Any) -> Any:

        # Return the modified table
        return {"header": header, "rows": rows}


class MaskColumnsNames(TypeDependentAugmentor):
    """Mask the names of the table columns with dummy names "Col1", "Col2", etc."""

    augmented_type = Table

    def process_value(self, table: Any) -> Any:
        masked_header = ["Col" + str(ind + 1) for ind in range(len(table["header"]))]

        return {"header": masked_header, "rows": table["rows"]}


class ShuffleColumnsNames(TypeDependentAugmentor):
    """Shuffle the table column names so they are displayed in random order."""

    augmented_type = Table

    def process_value(self, table: Any) -> Any:
        # Shuffle a copy of the header so the input table is not mutated in place
        shuffled_header = list(table["header"])
        random.shuffle(shuffled_header)

        return {"header": shuffled_header, "rows": table["rows"]}
