Deploy to HF: upload dataset via spaces & bug fixes for the working demo. #432

Merged: 8 commits, Jul 10, 2023
Changes from 6 commits
16 changes: 15 additions & 1 deletion .env
@@ -17,6 +17,20 @@ DUCKDB_USE_VIEWS=0

# Get key from https://platform.openai.com/account/api-keys
# OPENAI_API_KEY=

# Get key from https://makersuite.google.com/app/apikey
# PALM_API_KEY=

# HuggingFace demos: machine that uploads to HuggingFace.

# For authenticating with HuggingFace to deploy to a Space.
# HF_USERNAME=
# The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
# HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'

# HuggingFace demos: HuggingFace machine that runs the demo.

# To read private uploaded data from the server (running on HF spaces) for the demo.
# Get a token from https://huggingface.co/settings/tokens
# HF_ACCESS_TOKEN=
# To sync data from huggingface before the server boots.
# HF_DATA_FROM_SPACE='HF_ORG/HF_REPO_NAME'
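As a rough illustration (not part of the diff) of how these two groups of variables are consumed: the machine that deploys reads `HF_USERNAME` and `HF_STAGING_DEMO_REPO`, while the server running on the Space reads `HF_ACCESS_TOKEN` and `HF_DATA_FROM_SPACE`. A minimal sketch, assuming a dotenv-style loader; the project's actual loading lives behind `src.config.CONFIG`:

```python
# Sketch only: illustrates how the .env / .env.local values above might be surfaced.
import os

from dotenv import dotenv_values  # assumes python-dotenv is available

# .env holds shared defaults; .env.local and real environment variables override them.
CONFIG: dict = {**dotenv_values('.env'), **dotenv_values('.env.local'), **os.environ}

hf_username = CONFIG.get('HF_USERNAME')               # deploy machine: auth for pushing to the Space
hf_staging_repo = CONFIG.get('HF_STAGING_DEMO_REPO')  # deploy machine: default Space repo
hf_token = CONFIG.get('HF_ACCESS_TOKEN')              # Space machine: read private uploaded data
hf_data_repo = CONFIG.get('HF_DATA_FROM_SPACE')       # Space machine: repo to sync data from at boot
```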
2 changes: 0 additions & 2 deletions .gitignore
@@ -8,8 +8,6 @@ cloned_repos/
py_coverage_html/
*.deps.txt
requirements.txt
# Cloned huggingface spaces repos for pushing demos.
hf_spaces/

# Mac OS.
.DS_Store
6 changes: 0 additions & 6 deletions Dockerfile
@@ -12,12 +12,6 @@ WORKDIR /server
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

Collaborator:
Can you add `-k uvicorn.workers.UvicornWorker` to the uvicorn line (line 30)? Worth testing locally. See https://www.uvicorn.org/deployment/

Contributor (author):
Done

# Copy the data to /data, the HF persistent storage. We do this after pip install to avoid
# re-installing dependencies if the data changes, which is likely more often.
WORKDIR /
COPY /data /data
WORKDIR /server

COPY .env .
COPY LICENSE .

25 changes: 23 additions & 2 deletions README.md
@@ -38,13 +38,34 @@ Details can be found at [Managing Spaces with Github Actions](https://huggingfac

We use the HuggingFace git server; [follow the instructions](https://huggingface.co/docs/hub/repositories-getting-started) to use your git SSH keys to talk to HuggingFace.

###### Staging demo

Make sure you have created a HuggingFace space: [huggingface.co/spaces](https://huggingface.co/spaces)

Set `.env.local` environment variables so you can upload data to the space:

```sh
# The repo to use for the huggingface demo.
HF_STAGING_DEMO_REPO='lilacai/your-space'
# To authenticate with HuggingFace for uploading to the space.
HF_USERNAME='your-username'
```

In the HuggingFace Space Settings UI, set the following environment variables so the binary running on HuggingFace can read private space data:

- `HF_DATA_FROM_SPACE`: lilacai/your-space
- `HF_ACCESS_TOKEN`: yourtoken

NOTE: `HF_ACCESS_TOKEN` can be generated from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). Create a read-only token for this step.

To deploy to huggingface:

```sh
poetry run python -m scripts.deploy_hf \
--hf_username=$HF_USERNAME \
--hf_space=$HF_ORG/$HF_SPACE \
--dataset=$DATASET_NAMESPACE/$DATASET_NAME

# --hf_username and --hf_space are optional and can override the ENV for local uploading.
```

#### Deployment
Empty file added scripts/__init__.py
Empty file.
54 changes: 40 additions & 14 deletions scripts/deploy_hf.py
@@ -1,47 +1,59 @@
"""Deploy to a huggingface space."""
import os
import subprocess
from typing import Optional

import click
from huggingface_hub import HfApi

HF_SPACE_DIR = 'hf_spaces'
from src.config import CONFIG, data_path
from src.utils import get_dataset_output_dir

HF_SPACE_DIR = os.path.join(data_path(), '.hf_spaces')


@click.command()
@click.option(
'--hf_username',
help='The huggingface username to use to authenticate for the space.',
type=str,
required=True)
'--hf_username', help='The huggingface username to use to authenticate for the space.', type=str)
@click.option(
'--hf_space',
help='The huggingface space. Should be formatted like `SPACE_ORG/SPACE_NAME`',
type=str,
required=True)
help='The huggingface space. Defaults to env.HF_STAGING_DEMO_REPO. '
'Should be formatted like `SPACE_ORG/SPACE_NAME`.',
type=str)
@click.option(
'--skip_build',
help='Skip building the web server TypeScript. '
'Useful if you are only changing python or are only changing data.',
'Useful to speed up the build if you are only changing python or data.',
type=bool,
default=False)
@click.option('--dataset', help='The name of a dataset to upload', type=str, multiple=True)
def main(hf_username: str, hf_space: str, dataset: list[str], skip_build: bool) -> None:
def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str],
skip_build: bool) -> None:
"""Generate the huggingface space app."""
hf_username = hf_username or CONFIG['HF_USERNAME']
if not hf_username:
raise ValueError('Must specify --hf_username or set env.HF_USERNAME')

hf_space = hf_space or CONFIG['HF_STAGING_DEMO_REPO']
if not hf_space:
raise ValueError('Must specify --hf_space or set env.HF_STAGING_DEMO_REPO')

  # Upload datasets to HuggingFace.
  # NOTE(nsthorat): This currently doesn't write to persistent storage and does not work because of
  # a bug in HuggingFace.
  hf_api = HfApi()
  for d in dataset:
    dataset_path = os.path.join('data', 'datasets', d)
    namespace, name = d.split('/')

    hf_api.upload_folder(
      folder_path=os.path.abspath(dataset_path),
      path_in_repo='/' + dataset_path,
      folder_path=get_dataset_output_dir(data_path(), namespace, name),
      path_in_repo=get_dataset_output_dir('data', namespace, name),
      repo_id=hf_space,
      repo_type='space',
      # Delete all data on the server.
      delete_patterns='*')

  # Build the web server Svelte & TypeScript.
  if not skip_build:
    run('sh ./scripts/build_server_prod.sh')

@@ -52,19 +64,33 @@ def main(hf_username: str, hf_space: str, dataset: list[str], skip_build: bool)
  run(f'rm -rf {repo_basedir}')
  run(f'git clone https://{hf_username}@huggingface.co/spaces/{hf_space} {repo_basedir}')

  # Clear out the repo.
  run(f'rm -rf {repo_basedir}/*')

  # Export the requirements file so it can be pip installed in the docker container.
  run(f'poetry export --without-hashes > {repo_basedir}/requirements.txt')

  # Copy source code.
  copy_dirs = ['src', 'web/blueprint/build']
  for dir in copy_dirs:
    run(f'rm -rf {repo_basedir}/{dir}')
    run(f'mkdir -p {repo_basedir}/{dir}')
    run(f'cp -vaR ./{dir}/* {repo_basedir}/{dir}')

  # Copy a subset of root files.
  copy_files = ['.env', 'Dockerfile', 'LICENSE']
  copy_files = ['.dockerignore', '.env', 'Dockerfile', 'LICENSE']
  for file in copy_files:
    run(f'cp ./{file} {repo_basedir}/{file}')

  # Create a .gitignore to avoid uploading unnecessary files.
  with open(f'{repo_basedir}/.gitignore', 'w') as f:
    f.write("""**/__pycache__
**/*.pyc
**/*.pyo
**/*.pyd
**/*_test.py
""")

  # Create the huggingface README.
  with open(f'{repo_basedir}/README.md', 'w') as f:
    f.write("""---
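For reference on the upload step above, this is a standalone sketch of pushing one dataset folder into a Space with `HfApi.upload_folder`; the namespace, dataset name, and repo id are hypothetical, and the real script resolves both paths with `get_dataset_output_dir`:

```python
# Minimal sketch: upload a single local dataset directory into a Space repo.
from huggingface_hub import HfApi

hf_api = HfApi()
namespace, name = 'local', 'imdb'  # hypothetical dataset namespace and name

hf_api.upload_folder(
  folder_path=f'data/datasets/{namespace}/{name}',   # local dataset output directory
  path_in_repo=f'data/datasets/{namespace}/{name}',  # mirror the same layout inside the Space
  repo_id='lilacai/your-space',                      # e.g. the HF_STAGING_DEMO_REPO value
  repo_type='space',
  # Clear out any stale files for this dataset before uploading the new ones.
  delete_patterns='*')
```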
36 changes: 2 additions & 34 deletions src/router_dataset.py
@@ -1,5 +1,4 @@
"""Router for the dataset database."""
import os
from typing import Optional, Sequence, Union, cast
from urllib.parse import unquote

@@ -39,49 +38,18 @@
)
from .signals.substring_search import SubstringSignal
from .tasks import TaskId, task_manager
from .utils import DATASETS_DIR_NAME
from .utils import DatasetInfo, list_datasets

router = APIRouter(route_class=RouteErrorHandler)

register_default_signals()
set_default_dataset_cls(DatasetDuckDB)


class DatasetInfo(BaseModel):
"""Information about a dataset."""
namespace: str
dataset_name: str
description: Optional[str]


@router.get('/', response_model_exclude_none=True)
def get_datasets() -> list[DatasetInfo]:
"""List the datasets."""
datasets_path = os.path.join(data_path(), DATASETS_DIR_NAME)
# Skip if 'datasets' doesn't exist.
if not os.path.isdir(datasets_path):
return []

dataset_infos: list[DatasetInfo] = []
for namespace in os.listdir(datasets_path):
dataset_dir = os.path.join(datasets_path, namespace)
# Skip if namespace is not a directory.
if not os.path.isdir(dataset_dir):
continue
if namespace.startswith('.'):
continue

for dataset_name in os.listdir(dataset_dir):
# Skip if dataset_name is not a directory.
dataset_path = os.path.join(dataset_dir, dataset_name)
if not os.path.isdir(dataset_path):
continue
if dataset_name.startswith('.'):
continue

dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name))

return dataset_infos
return list_datasets(data_path())


class WebManifest(BaseModel):
50 changes: 48 additions & 2 deletions src/server.py
@@ -2,16 +2,21 @@

import logging
import os
import shutil
import subprocess
from typing import Any

from fastapi import APIRouter, FastAPI
from fastapi.responses import ORJSONResponse
from fastapi.responses import FileResponse, ORJSONResponse
from fastapi.routing import APIRoute
from fastapi.staticfiles import StaticFiles
from huggingface_hub import snapshot_download

from . import router_concept, router_data_loader, router_dataset, router_signal, router_tasks
from .config import CONFIG, data_path
from .router_utils import RouteErrorHandler
from .tasks import task_manager
from .utils import get_dataset_output_dir, list_datasets

DIST_PATH = os.path.abspath(os.path.join('web', 'blueprint', 'build'))

@@ -49,8 +54,49 @@ def custom_generate_unique_id(route: APIRoute) -> str:

app.include_router(v1_router, prefix='/api/v1')


@app.api_route('/{path_name}', include_in_schema=False)
def catch_all() -> FileResponse:
"""Catch any other requests and serve index for HTML5 history."""
return FileResponse(path=os.path.join(DIST_PATH, 'index.html'))


# Serve static files in production mode.
app.mount('/', StaticFiles(directory=os.path.join(DIST_PATH), html=True, check_dir=False))
app.mount('/', StaticFiles(directory=DIST_PATH, html=True, check_dir=False))


@app.on_event('startup')
def startup() -> None:
"""Download dataset files from the HF space that was uploaded before building the image."""
repo_id = CONFIG.get('HF_DATA_FROM_SPACE', None)

if repo_id:
# Download the huggingface space data. This includes code and datasets, so we move the datasets
# alone to the data directory.
spaces_download_dir = os.path.join(data_path(), '.hf_spaces', repo_id)
snapshot_download(
repo_id=repo_id,
repo_type='space',
local_dir=spaces_download_dir,
local_dir_use_symlinks=False,
token=CONFIG['HF_ACCESS_TOKEN'])

datasets = list_datasets(os.path.join(spaces_download_dir, 'data'))
for dataset in datasets:
spaces_dataset_output_dir = get_dataset_output_dir(
os.path.join(spaces_download_dir, 'data'), dataset.namespace, dataset.dataset_name)
persistent_output_dir = get_dataset_output_dir(data_path(), dataset.namespace,
dataset.dataset_name)

# Huggingface doesn't let you selectively download files so we just copy the data directory
# out of the cloned space.
shutil.rmtree(persistent_output_dir, ignore_errors=True)
shutil.move(spaces_dataset_output_dir, persistent_output_dir)


def run(cmd: str) -> subprocess.CompletedProcess[bytes]:
"""Run a command and return the result."""
return subprocess.run(cmd, shell=True, check=True)


@app.on_event('shutdown')
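To sanity-check the startup sync outside the server, a small standalone sketch can download the Space snapshot and list the dataset folders it contains; the repo id, local directory, and token lookup below are assumptions, and the server itself resolves paths through `CONFIG`, `data_path()`, and `list_datasets`:

```python
# Sketch: pull a Space snapshot and print which dataset folders it carries.
import os

from huggingface_hub import snapshot_download

local_dir = snapshot_download(
  repo_id='lilacai/your-space',  # hypothetical HF_DATA_FROM_SPACE value
  repo_type='space',
  local_dir='/tmp/hf_space_snapshot',
  local_dir_use_symlinks=False,
  token=os.environ.get('HF_ACCESS_TOKEN'))

datasets_dir = os.path.join(local_dir, 'data', 'datasets')
if os.path.isdir(datasets_dir):
  for namespace in sorted(os.listdir(datasets_dir)):
    for name in sorted(os.listdir(os.path.join(datasets_dir, namespace))):
      print(f'{namespace}/{name}')
```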
37 changes: 37 additions & 0 deletions src/utils.py
@@ -106,6 +106,43 @@ def get_dataset_output_dir(base_dir: Union[str, pathlib.Path], namespace: str,
  return os.path.join(get_datasets_dir(base_dir), namespace, dataset_name)


class DatasetInfo(BaseModel):
"""Information about a dataset."""
namespace: str
dataset_name: str
description: Optional[str]


def list_datasets(base_dir: Union[str, pathlib.Path]) -> list[DatasetInfo]:
"""List the datasets in a data directory."""
datasets_path = get_datasets_dir(base_dir)

# Skip if 'datasets' doesn't exist.
if not os.path.isdir(datasets_path):
return []

dataset_infos: list[DatasetInfo] = []
for namespace in os.listdir(datasets_path):
dataset_dir = os.path.join(datasets_path, namespace)
# Skip if namespace is not a directory.
if not os.path.isdir(dataset_dir):
continue
if namespace.startswith('.'):
continue

for dataset_name in os.listdir(dataset_dir):
# Skip if dataset_name is not a directory.
dataset_path = os.path.join(dataset_dir, dataset_name)
if not os.path.isdir(dataset_path):
continue
if dataset_name.startswith('.'):
continue

dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name))

return dataset_infos


class CopyRequest(BaseModel):
"""A request to copy a file from source to destination path. Used to copy media files to GCS."""
from_path: str
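As a usage note for the new helper: `list_datasets` walks `<base_dir>/datasets/<namespace>/<dataset_name>`, skipping hidden entries and plain files, so it works against both the persistent data directory and a downloaded Space snapshot. A small sketch with a hypothetical layout:

```python
# Sketch: build a throwaway data directory and list it with the new helper.
import os
import tempfile

from src.utils import list_datasets

base_dir = tempfile.mkdtemp()
for namespace, name in [('local', 'imdb'), ('local', 'wiki')]:
  os.makedirs(os.path.join(base_dir, 'datasets', namespace, name))

for info in list_datasets(base_dir):
  print(info.namespace, info.dataset_name)  # -> "local imdb", "local wiki"
```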
5 changes: 5 additions & 0 deletions web/blueprint/src/routes/+layout.svelte
@@ -18,6 +18,11 @@
  let showError: ApiError | undefined = undefined;

  onMount(() => {
    // This fixes a cross-origin error when the app is embedded in an iframe. Some carbon
    // components attach listeners to window.parent, which is not allowed in an iframe, so we set
    // the parent to window.
    window.parent = window;

    urlHash.set(location.hash);
    history.pushState = function (_state, _unused, url) {
      if (url instanceof URL) {