chore: add annotation & small changes

EvoEvolver · Oct 18, 2023 · 442049f · 442049f
1 parent d9246ce
commit 442049f
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 7 deletions.
diff --git a/docs/development.md b/docs/development.md
@@ -4,7 +4,7 @@ IDE recommendation: PyCharm
 
 Docstring style: rst
 
-## DocInPy
+## Documentation format
 
 Please use the [Moduler](https://m.evoevo.org) style when adding sections to the code.
 

diff --git a/fibers/data_loader/html_to_tree.py b/fibers/data_loader/html_to_tree.py
@@ -15,7 +15,11 @@ def url_to_tree(url: str) -> Tree:
 
 def html_to_tree(html: str) -> Tree:
     soup = BeautifulSoup(html, "html.parser")
-    title = soup.find("title").text
+    title = soup.find("title")
+    if title:
+        title = title.text
+    else:
+        title = ""
     root = extract_article_root(soup)
     tree = html_to_raw_tree(root, title=title)
     html_to_markdown(tree.root)
@@ -83,7 +87,7 @@ def extract_article_root(soup: BeautifulSoup):
     n_article_elements = []
     elements = []
     for path, element in bfs_on_soup(soup):
-        if element.name in ["div", "article"]:
+        if element.name in ["div", "article", "html", "body", "main"]:
             elements.append(element)
             # count the number of article related elements
             n_article_elements_here = 0

diff --git a/fibers/testing/testing_article/__init__.py b/fibers/testing/testing_article/__init__.py
diff --git a/fibers/testing/testing_article/loader.py b/fibers/testing/testing_article/loader.py
@@ -0,0 +1,42 @@
+import os
+
+import jsonlines
+
+# Directory containing this module; dataset files are read from and
+# downloaded into this directory.
+curr_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Maps each supported dataset name to the URL it is downloaded from.
+# The name is also used as the local file name inside ``curr_dir``.
+url_to_dataset = {
+    "QuALITY.v1.0.1.train": "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1/QuALITY.v1.0.1.train",
+    "QuALITY.v1.0.1.dev": "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1/QuALITY.v1.0.1.dev",
+    "QuALITY.v1.0.1.test": "https://raw.githubusercontent.com/nyu-mll/quality/main/data/v1.0.1/QuALITY.v1.0.1.test",
+}
+
+
+def download_dataset(name):
+    if name not in url_to_dataset:
+        raise ValueError(f"Unknown dataset {name}")
+    url = url_to_dataset[name]
+    dataset_path = os.path.join(curr_dir, name)
+    print(f"Downloading dataset {name} from {url} to {dataset_path}")
+    os.system(f"curl {url} -o {dataset_path}")
+
+
+def iter_dataset(name):
+    dataset_path = os.path.join(curr_dir, name)
+    if not os.path.exists(dataset_path):
+        download_dataset(name)
+    with jsonlines.open(dataset_path) as reader:
+        for obj in reader:
+            yield obj
+
+
+def extract_dataset(name, number):
+    i = 0
+    for obj in iter_dataset(name):
+        if i == number:
+            return obj
+        i += 1
+
+
+if __name__ == "__main__":
+    data = extract_dataset("QuALITY.v1.0.1.dev", 2)
+    print(data)
diff --git a/fibers/testing/testing_trees/loader.py b/fibers/testing/testing_trees/loader.py
@@ -3,6 +3,7 @@
 
 from fibers.data_loader.document import Document
 from fibers.data_loader.html_to_tree import html_to_tree
+from fibers.data_loader.latex_to_tree import latex_to_tree
 from fibers.data_loader.markdown_to_tree import markdown_to_tree
 from fibers.tree import Tree
 
@@ -23,6 +24,9 @@ def load_sample_tree(path: str) -> Tree:
         case ".md":
             file_name = os.path.splitext(path)[0]
             tree = markdown_to_tree(src, file_name)
+        case ".tex":
+            file_name = os.path.splitext(path)[0]
+            tree = latex_to_tree(src)
         case ".html":
             tree = html_to_tree(src)
         case _:

diff --git a/fibers/tree/tree.py b/fibers/tree/tree.py
@@ -28,8 +28,8 @@ def __init__(self, root_content="", rule_of_path: str = None):
 
         # Set up the root
         root = Node(self)
-        self.children[root] = {}
-        self.node_path[root] = tuple()
+        self.children[root]: Dict[Node, Node] = {}
+        self.node_path[root]: Dict[Node, Tuple[str, ...]] = tuple()
         root.set_content(root_content)
 
         # TODO: this can be node, tree, or string in the future
@@ -40,11 +40,11 @@ def __init__(self, root_content="", rule_of_path: str = None):
     ## Node information query
     """
 
-    def get_node_path(self, node: Node):
+    def get_node_path(self, node: Node) -> Tuple[str, ...]:
         try:
             return self.node_path[node]
         except KeyError:
-            return None
+            raise Exception(f"Node {node} not in tree")
 
     def get_children_dict(self, node: Node):
         return self.children[node]