Skip to content
This repository has been archived by the owner on Jun 15, 2024. It is now read-only.

Commit

Permalink
chore: add annotation & small changes
Browse files Browse the repository at this point in the history
  • Loading branch information
doomspec committed Oct 18, 2023
1 parent d9246ce commit 442049f
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ IDE recommendation: PyCharm

Docstring style: rst

## DocInPy
## Documentation format

Please use the [Moduler](https://m.evoevo.org) style for adding sections in the code.

Expand Down
8 changes: 6 additions & 2 deletions fibers/data_loader/html_to_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ def url_to_tree(url: str) -> Tree:

def html_to_tree(html: str) -> Tree:
soup = BeautifulSoup(html, "html.parser")
title = soup.find("title").text
title = soup.find("title")
if title:
title = title.text
else:
title = ""
root = extract_article_root(soup)
tree = html_to_raw_tree(root, title=title)
html_to_markdown(tree.root)
Expand Down Expand Up @@ -83,7 +87,7 @@ def extract_article_root(soup: BeautifulSoup):
n_article_elements = []
elements = []
for path, element in bfs_on_soup(soup):
if element.name in ["div", "article"]:
if element.name in ["div", "article", "html", "body", "main"]:
elements.append(element)
# count the number of article related elements
n_article_elements_here = 0
Expand Down
Empty file.
42 changes: 42 additions & 0 deletions fibers/testing/testing_article/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import os

import jsonlines

# Directory that contains this module; dataset files are stored alongside it.
curr_dir = os.path.dirname(os.path.abspath(__file__))

# Common prefix of the raw QuALITY v1.0.1 data files.
_QUALITY_BASE_URL = (
    "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1"
)

# Maps each known dataset name to the URL it is downloaded from.
url_to_dataset = {
    split_name: f"{_QUALITY_BASE_URL}/{split_name}"
    for split_name in (
        "QuALITY.v1.0.1.train",
        "QuALITY.v1.0.1.dev",
        "QuALITY.v1.0.1.test",
    )
}


def download_dataset(name):
    """Download a known dataset into this module's directory using ``curl``.

    The file is first written to ``<name>.part`` and only renamed to its
    final path once ``curl`` has succeeded, so an interrupted or failed
    download is never mistaken for a complete dataset file.

    :param name: key of ``url_to_dataset`` identifying the dataset.
    :raises ValueError: if ``name`` is not a known dataset.
    :raises FileNotFoundError: if the ``curl`` executable cannot be found.
    :raises subprocess.CalledProcessError: if ``curl`` exits with a non-zero
        status (with ``--fail`` this includes HTTP error responses).
    """
    # Function-scope import so the block does not depend on a file-level import.
    import subprocess

    if name not in url_to_dataset:
        raise ValueError(f"Unknown dataset {name}")
    url = url_to_dataset[name]
    dataset_path = os.path.join(curr_dir, name)
    partial_path = dataset_path + ".part"
    print(f"Downloading dataset {name} from {url} to {dataset_path}")
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True turns a curl failure into an exception
    # instead of silently ignoring the exit status.
    subprocess.run(["curl", "--fail", url, "-o", partial_path], check=True)
    # Publish the file under its final name only after a successful transfer.
    os.replace(partial_path, dataset_path)


def iter_dataset(name):
    """Yield the records of dataset ``name`` one at a time.

    The dataset file is looked up next to this module; when it is not
    there yet, ``download_dataset`` is called before reading.

    :param name: name of the dataset file to read.
    """
    local_file = os.path.join(curr_dir, name)
    already_present = os.path.exists(local_file)
    if not already_present:
        download_dataset(name)
    with jsonlines.open(local_file) as records:
        yield from records


def extract_dataset(name, number):
    """Return the record at zero-based position ``number`` of dataset ``name``.

    :param name: name of the dataset to read.
    :param number: zero-based index of the wanted record.
    :return: the matching record, or ``None`` when no record has that index.
    """
    matching_records = (
        record
        for position, record in enumerate(iter_dataset(name))
        if position == number
    )
    # Stops reading as soon as the first (and only possible) match is found.
    return next(matching_records, None)


if __name__ == "__main__":
    # Manual check: print the record at index 2 of the dev split.
    print(extract_dataset("QuALITY.v1.0.1.dev", 2))
4 changes: 4 additions & 0 deletions fibers/testing/testing_trees/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from fibers.data_loader.document import Document
from fibers.data_loader.html_to_tree import html_to_tree
from fibers.data_loader.latex_to_tree import latex_to_tree
from fibers.data_loader.markdown_to_tree import markdown_to_tree
from fibers.tree import Tree

Expand All @@ -23,6 +24,9 @@ def load_sample_tree(path: str) -> Tree:
case ".md":
file_name = os.path.splitext(path)[0]
tree = markdown_to_tree(src, file_name)
case ".tex":
file_name = os.path.splitext(path)[0]
tree = latex_to_tree(src)
case ".html":
tree = html_to_tree(src)
case _:
Expand Down
8 changes: 4 additions & 4 deletions fibers/tree/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, root_content="", rule_of_path: str = None):

# Set up the root
root = Node(self)
self.children[root] = {}
self.node_path[root] = tuple()
self.children[root]: Dict[Node, Node] = {}
self.node_path[root]: Dict[Node, Tuple[str, ...]] = tuple()
root.set_content(root_content)

# TODO: this can be node, tree, or string in the future
Expand All @@ -40,11 +40,11 @@ def __init__(self, root_content="", rule_of_path: str = None):
## Node information query
"""

def get_node_path(self, node: Node):
def get_node_path(self, node: Node) -> Tuple[str, ...]:
try:
return self.node_path[node]
except KeyError:
return None
raise Exception(f"Node {node} not in tree")

def get_children_dict(self, node: Node):
return self.children[node]
Expand Down

0 comments on commit 442049f

Please sign in to comment.