Merge pull request #27 from Yukino2002/revamp
Update scripts to use the Github API
rbharath authored Aug 9, 2023
2 parents e5c469a + 7b52bf2 commit 4379f5c
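
The change described by the commit message is a switch from scraping the repository's HTML pages with requests + BeautifulSoup to listing files through the GitHub repository contents API. A minimal sketch of that call pattern (an illustration using the public, unauthenticated endpoint, not code taken verbatim from the diff):

import requests

# List the files under examples/tutorials via the GitHub contents API.
url = "https://api.github.com/repos/deepchem/deepchem/contents/examples/tutorials"
response = requests.get(url)
response.raise_for_status()

# Each entry is a dict with metadata such as 'name' and 'download_url'.
for entry in response.json():
    if entry["name"].endswith(".ipynb"):
        print(entry["name"], entry["download_url"])

The unauthenticated endpoint is rate limited, so the response status is worth checking, as the updated script does.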
Showing 4 changed files with 200 additions and 2,012 deletions.
new-website/utils/tutorials/fetch_tutorials.py (127 changes: 78 additions & 49 deletions)
@@ -12,83 +12,102 @@
import os
import shutil
import subprocess
import re
import requests
-from bs4 import BeautifulSoup
from utils import clean

+DEEPCHEM_REPO_OWNER = "deepchem"
+DEEPCHEM_REPO_NAME = "deepchem"
+TUTORIALS_PATH = "examples/tutorials"
+TUTORIAL_RENDER_ORDER_PATH = "examples/tutorials/website-render-order"


-def fetch_file_list_from_repo(path_to_directory):
+def fetch_file_list_from_repo(repo_owner, repo_name, path):
"""
Fetches the names of all the files from a given directory in a Github repository.
Fetches the metadata like name, download_url etc of all the files in a given path in a Github repository .
Parameters
----------
path_to_directory: str
The URL of the directory in the Github repository.
repo_owner: str
The owner of the Github repository.
repo_name: str
The name of the Github repository.
path: str
The path in the Github repository from which the file names are to be fetched.
Returns
-------
files: list
A list of strings, where each string represents the name of a file present in the directory.
"""
files = []
response = requests.get(path_to_directory)
soup = BeautifulSoup(response.content, 'html.parser')

fileNames = soup.find_all(
'a', attrs={'class': 'js-navigation-open Link--primary'})
for fileName in fileNames:
fileName = fileName.text
files.append(fileName)

return files

data: list
A list of dictionaries, where each dictionary contains various metadata like the name and download_url of a file.
def fetch_tutorial_data():
Raises
------
Exception
If the response status code is not 200.
"""
Fetches the names of all the tutorials from the given Github URL.
url = "https://api.github.com/repos/{}/{}/contents/{}".format(
repo_owner, repo_name, path)
response = requests.get(url)
if response.status_code == 200:
data = response.json()
return data
else:
raise Exception(
"Error fetching file names: {}".format(response.status_code))


def get_tutorial_list():
"""
Fetches the names of all the tutorials from the Deepchem Github repository.
Returns
-------
tutorials: list
A list of strings, where each string represents the name of a tutorial.
"""
tutorials_url = 'https://github.com/deepchem/deepchem/tree/master/examples/tutorials'
tutorials = fetch_file_list_from_repo(tutorials_url)
tutorials = fetch_file_list_from_repo(
DEEPCHEM_REPO_OWNER, DEEPCHEM_REPO_NAME, TUTORIALS_PATH)

tutorial_names = [tutorial['name'] for tutorial in tutorials]

# Filter only the ipynb files
tutorials = [
tutorial for tutorial in tutorials if tutorial.endswith('.ipynb')]
return tutorials
tutorial_names = [
tutorial for tutorial in tutorial_names if tutorial.endswith('.ipynb')]
return tutorial_names


def fetch_tutorial_render_order():
    """
    Downloads the CSV files containing the tutorial order from the Deepchem repository.
    """

-    raw_path = 'https://raw.githubusercontent.com/deepchem/deepchem/master/examples/tutorials/website-render-order/'
-    csv_directory = 'https://github.com/deepchem/deepchem/tree/master/examples/tutorials/website-render-order'
-    tutorial_order = fetch_file_list_from_repo(csv_directory)
+    tutorial_order_csv_data = fetch_file_list_from_repo(
+        DEEPCHEM_REPO_OWNER, DEEPCHEM_REPO_NAME, TUTORIAL_RENDER_ORDER_PATH)
+
+    tutorial_order_csv_names = [tutorial['name']
+                                for tutorial in tutorial_order_csv_data]

    # Filter only the csv files
-    tutorial_order = [
-        tutorial for tutorial in tutorial_order if tutorial.endswith('.csv')]
+    tutorial_order_csv_names = [
+        tutorial for tutorial in tutorial_order_csv_names if tutorial.endswith('.csv')]

-    for tutorial_group in tutorial_order:
-        response = requests.get(raw_path + tutorial_group)
-        with open(f"./website-render-order/{tutorial_group}", "wb") as tutorial_file:
-            tutorial_file.write(response.content)
+    for tutorial_group in tutorial_order_csv_data:
+        response = requests.get(tutorial_group.get('download_url'))
+        if response.status_code == 200:
+            with open(f"./website-render-order/{tutorial_group.get('name')}", "wb") as tutorial_file:
+                tutorial_file.write(response.content)
+        else:
+            raise Exception(
+                "Error fetching tutorial render order: {}".format(response.status_code))


def create_directories():
    """
    Creates the required directories
    """
-    os.makedirs('./html-notebooks',exist_ok=True)
+    os.makedirs('./html-notebooks', exist_ok=True)
    os.makedirs('./ipynb-notebooks', exist_ok=True)
    os.makedirs('./website-render-order', exist_ok=True)


def convert_to_html(tutorials):
    """
    Converts the Jupyter notebooks in the './ipynb-notebooks' directory to HTML files and stores them in the './html-notebooks' directory.
@@ -101,31 +120,41 @@ def convert_to_html(tutorials):
    fromPath = "./ipynb-notebooks/"
    toPath = "./html-notebooks/"

-    tutorialURL = 'https://raw.githubusercontent.com/deepchem/deepchem/master/examples/tutorials/'
-
    for tutorial in tutorials:
        try:
-            file_name_html = f'{tutorial.rsplit(".")[0]}.html'
-            response = requests.get(tutorialURL + tutorial)
-            with open(f"./ipynb-notebooks/{tutorial}", "wb") as tutorial_file:
+            tutorial_file_name = tutorial["name"]
+            tutorial_download_link = tutorial["download_url"]
+
+            if not tutorial_file_name.endswith('.ipynb'):
+                continue
+
+            file_name_html = f'{tutorial_file_name.rsplit(".")[0]}.html'
+            response = requests.get(tutorial_download_link)
+            with open(f"./ipynb-notebooks/{tutorial_file_name}", "wb") as tutorial_file:
                tutorial_file.write(response.content)

            subprocess.call(
-                f'jq -M "del(.metadata.widgets)" ./ipynb-notebooks/{tutorial} > ./ipynb-notebooks/fixed-{tutorial}', shell=True)
+                f'jq -M "del(.metadata.widgets)" ./ipynb-notebooks/{tutorial_file_name} > ./ipynb-notebooks/fixed-{tutorial_file_name}', shell=True
+            )
            subprocess.call(
-                f'python -m nbconvert --to html ./ipynb-notebooks/fixed-{tutorial}', shell=True)
+                f'python -m nbconvert --to html ./ipynb-notebooks/fixed-{tutorial_file_name}', shell=True)
            shutil.copyfile(f'{fromPath}fixed-{file_name_html}',
                            toPath + file_name_html)

            with open('./notebooks.txt', "a") as notebook_list:
                notebook_list.write(file_name_html + '\n')
        except Exception as exception:
            print(exception)
-            print(f"Could not process {tutorial}")
+            print(f"Could not process {tutorial_file_name}")


if __name__ == "__main__":
-    tutorials = fetch_tutorial_data()
    create_directories()
-    convert_to_html(tutorials)
+    tutorials = fetch_file_list_from_repo(
+        DEEPCHEM_REPO_OWNER, DEEPCHEM_REPO_NAME, TUTORIALS_PATH)
+
+    # Throw an AssertionError if no tutorials were fetched, to prevent deploying the website without tutorials.
+    assert len(tutorials) > 0
+
+    fetch_tutorial_render_order()
+    convert_to_html(tutorials)
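
For readers unfamiliar with the jq step in convert_to_html: del(.metadata.widgets) strips the notebook's widget-state metadata, which nbconvert sometimes fails to render. A rough pure-Python sketch of that cleanup plus the conversion call (illustrative only; the file name is hypothetical, and the script itself keeps shelling out to jq):

import json
import subprocess

def strip_widget_metadata(src_path, dst_path):
    # Pure-Python equivalent of: jq -M "del(.metadata.widgets)" src > dst
    with open(src_path, "r", encoding="utf-8") as src:
        notebook = json.load(src)
    notebook.get("metadata", {}).pop("widgets", None)  # drop widget state if present
    with open(dst_path, "w", encoding="utf-8") as dst:
        json.dump(notebook, dst)

# Hypothetical file name, mirroring the script's fixed-<name>.ipynb convention.
strip_widget_metadata("./ipynb-notebooks/Example_Tutorial.ipynb",
                      "./ipynb-notebooks/fixed-Example_Tutorial.ipynb")
subprocess.call(
    "python -m nbconvert --to html ./ipynb-notebooks/fixed-Example_Tutorial.ipynb",
    shell=True)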
