From 2d3f3445d00d54a5a516a98bf3a4a1029a7b8a0c Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Thu, 6 Jul 2023 16:34:02 -0400
Subject: [PATCH 1/7] save

---
 Dockerfile    |  3 +++
 src/server.py | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 8c728d9d4..d68612487 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,4 +27,7 @@ COPY /web/blueprint/build ./web/blueprint/build
 # Copy python files.
 COPY /src ./src/
 
+# Copy the entrypoint file.
+COPY docker_entrypoint.sh .
+
 CMD ["uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "5432"]
diff --git a/src/server.py b/src/server.py
index 2a7f841de..970517ff9 100644
--- a/src/server.py
+++ b/src/server.py
@@ -2,6 +2,7 @@
 
 import logging
 import os
+from contextlib import asynccontextmanager
 from typing import Any
 
 from fastapi import APIRouter, FastAPI
@@ -53,6 +54,16 @@ def custom_generate_unique_id(route: APIRoute) -> str:
 app.mount('/', StaticFiles(directory=os.path.join(DIST_PATH), html=True, check_dir=False))
 
 
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+  """The lifespan hook for the server."""
+  # Setup.
+
+  yield
+
+  # Teardown.
+
+
 @app.on_event('shutdown')
 async def shutdown_event() -> None:
   """Kill the task manager when FastAPI shuts down."""

From 92f8a90c44dd7dfe0b3d06e2ca63c4abeb498ba9 Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Fri, 7 Jul 2023 11:52:10 -0400
Subject: [PATCH 2/7] save

---
 .env                  | 12 +++++++++++
 Dockerfile            |  9 --------
 README.md             | 24 ++++++++++++++++++++--
 scripts/__init__.py   |  0
 scripts/deploy_hf.py  | 45 +++++++++++++++++++++++++++++-----------
 src/router_dataset.py | 36 ++------------------------------
 src/server.py         | 48 +++++++++++++++++++++++++++++++++--------
 src/utils.py          | 37 +++++++++++++++++++++++++++++++++
 8 files changed, 146 insertions(+), 65 deletions(-)
 create mode 100644 scripts/__init__.py

diff --git a/.env b/.env
index b361ad2d1..e06e7fc3e 100644
--- a/.env
+++ b/.env
@@ -17,3 +17,15 @@ DUCKDB_USE_VIEWS=0
 
 # Get key from https://platform.openai.com/account/api-keys
 # OPENAI_API_KEY=
+
+# For authenticating with HuggingFace to read private data from the hub for the huggingface
+# demo.
+# HF_USERNAME=
+# https://huggingface.co/settings/tokens
+# HF_ACCESS_TOKEN=
+
+# The repo to use for the huggingface demo.
+# HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
+
+# To sync data from huggingface before the server boots.
+# LILAC_DL_DATA_FROM_HF_SPACE='HF_ORG/HF_REPO_NAME'
diff --git a/Dockerfile b/Dockerfile
index d68612487..7a2a6a50b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,12 +12,6 @@ WORKDIR /server
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy the data to /data, the HF persistent storage. We do this after pip install to avoid
-# re-installing dependencies if the data changes, which is likely more often.
-WORKDIR /
-COPY /data /data
-WORKDIR /server
-
 COPY .env .
 COPY LICENSE .
 
@@ -27,7 +21,4 @@ COPY /web/blueprint/build ./web/blueprint/build
 # Copy python files.
 COPY /src ./src/
 
-# Copy the entrypoint file.
-COPY docker_entrypoint.sh .
-
 CMD ["uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "5432"]
diff --git a/README.md b/README.md
index 452f859e4..9cb4f1948 100644
--- a/README.md
+++ b/README.md
@@ -38,13 +38,33 @@ Details can be found at [Managing Spaces with Github Actions](https://huggingfac
 
 We use the HuggingFace git server, [follow the instructions](https://huggingface.co/docs/hub/repositories-getting-started) to use your git SSH keys to talk to HuggingFace.
 
+###### Staging demo
+
+Make sure you have created a HuggingFace space: [huggingface.co/spaces](https://huggingface.co/spaces)
+
+Set .env.local environment variables so you can upload data to the space:
+
+```sh
+# The repo to use for the huggingface demo.
+HF_STAGING_DEMO_REPO='lilacai/your-space'
+# To authenticate with HuggingFace for uploading to the space.
+HF_USERNAME='your-username'
+```
+
+Set the variables on the HuggingFace space from the UI to authenticate the binary running on HuggingFace to read private space data:
+
+- `LILAC_DL_DATA_FROM_HF_SPACE`: lilacai/your-space
+- `HF_ACCESS_TOKEN`: yourtoken
+
+NOTE: `HF_ACCESS_TOKEN` can be generated from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). Create a read-only token for this step.
+
 To deploy to huggingface:
 
 ```
 poetry run python -m scripts.deploy_hf \
-  --hf_username=$HF_USERNAME \
-  --hf_space=$HF_ORG/$HF_SPACE \
   --dataset=$DATASET_NAMESPACE/$DATASET_NAME
+
+# --hf_username and --hf_space are optional and can override the ENV for local uploading.
 ```
 
 #### Deployment
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/deploy_hf.py b/scripts/deploy_hf.py
index 7c28927d0..af31a8322 100644
--- a/scripts/deploy_hf.py
+++ b/scripts/deploy_hf.py
@@ -1,24 +1,25 @@
 """Deploy to a huggingface space."""
 import os
 import subprocess
+from typing import Optional
 
 import click
 from huggingface_hub import HfApi
 
+from src.config import CONFIG, data_path
+from src.utils import get_dataset_output_dir
+
 HF_SPACE_DIR = 'hf_spaces'
 
 
 @click.command()
 @click.option(
-  '--hf_username',
-  help='The huggingface username to use to authenticate for the space.',
-  type=str,
-  required=True)
+  '--hf_username', help='The huggingface username to use to authenticate for the space.', type=str)
 @click.option(
   '--hf_space',
-  help='The huggingface space. Should be formatted like `SPACE_ORG/SPACE_NAME`',
-  type=str,
-  required=True)
+  help='The huggingface space. Defaults to env.HF_STAGING_DEMO_REPO. '
+  'Should be formatted like `SPACE_ORG/SPACE_NAME`.',
+  type=str)
 @click.option(
   '--skip_build',
   help='Skip building the web server TypeScript. '
@@ -26,17 +27,27 @@
   type=bool,
   default=False)
 @click.option('--dataset', help='The name of a dataset to upload', type=str, multiple=True)
-def main(hf_username: str, hf_space: str, dataset: list[str], skip_build: bool) -> None:
+def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str],
+         skip_build: bool) -> None:
   """Generate the huggingface space app."""
+  hf_username = hf_username or CONFIG['HF_USERNAME']
+  if not hf_username:
+    raise ValueError('Must specify --hf_username or set env.HF_USERNAME')
+
+  hf_space = hf_space or CONFIG['HF_STAGING_DEMO_REPO']
+  if not hf_space:
+    raise ValueError('Must specify --hf_space or set env.HF_STAGING_DEMO_REPO')
+
   # Upload datasets to HuggingFace.
   # NOTE(nsthorat): This currently doesn't write to persistent storage and does not work because of
   # a bug in HuggingFace.
   hf_api = HfApi()
   for d in dataset:
-    dataset_path = os.path.join('data', 'datasets', d)
+    namespace, name = d.split('/')
+
     hf_api.upload_folder(
-      folder_path=os.path.abspath(dataset_path),
-      path_in_repo='/' + dataset_path,
+      folder_path=get_dataset_output_dir(data_path(), namespace, name),
+      path_in_repo=get_dataset_output_dir('data', namespace, name),
       repo_id=hf_space,
       repo_type='space',
       # Delete all data on the server.
@@ -54,14 +65,24 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
 
   run(f'poetry export --without-hashes > {repo_basedir}/requirements.txt')
 
+  # Create a .gitignore to avoid uploading unnecessary files.
+  with open(f'{repo_basedir}/.gitignore', 'w') as f:
+    f.write("""**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*_test.py
+""")
+
   # Copy source code.
   copy_dirs = ['src', 'web/blueprint/build']
   for dir in copy_dirs:
+    run(f'rm -rf {repo_basedir}/{dir}')
     run(f'mkdir -p {repo_basedir}/{dir}')
     run(f'cp -vaR ./{dir}/* {repo_basedir}/{dir}')
 
   # Copy a subset of root files.
-  copy_files = ['.env', 'Dockerfile', 'LICENSE']
+  copy_files = ['.dockerignore', '.env', 'Dockerfile', 'LICENSE']
   for file in copy_files:
     run(f'cp ./{file} {repo_basedir}/{file}')
diff --git a/src/router_dataset.py b/src/router_dataset.py
index 723a842cc..059320eae 100644
--- a/src/router_dataset.py
+++ b/src/router_dataset.py
@@ -1,5 +1,4 @@
 """Router for the dataset database."""
-import os
 from typing import Optional, Sequence, Union, cast
 from urllib.parse import unquote
 
@@ -39,7 +38,7 @@
 )
 from .signals.substring_search import SubstringSignal
 from .tasks import TaskId, task_manager
-from .utils import DATASETS_DIR_NAME
+from .utils import DatasetInfo, list_datasets
 
 router = APIRouter(route_class=RouteErrorHandler)
 
@@ -47,41 +46,10 @@
 set_default_dataset_cls(DatasetDuckDB)
 
 
-class DatasetInfo(BaseModel):
-  """Information about a dataset."""
-  namespace: str
-  dataset_name: str
-  description: Optional[str]
-
-
 @router.get('/', response_model_exclude_none=True)
 def get_datasets() -> list[DatasetInfo]:
   """List the datasets."""
-  datasets_path = os.path.join(data_path(), DATASETS_DIR_NAME)
-  # Skip if 'datasets' doesn't exist.
-  if not os.path.isdir(datasets_path):
-    return []
-
-  dataset_infos: list[DatasetInfo] = []
-  for namespace in os.listdir(datasets_path):
-    dataset_dir = os.path.join(datasets_path, namespace)
-    # Skip if namespace is not a directory.
-    if not os.path.isdir(dataset_dir):
-      continue
-    if namespace.startswith('.'):
-      continue
-
-    for dataset_name in os.listdir(dataset_dir):
-      # Skip if dataset_name is not a directory.
-      dataset_path = os.path.join(dataset_dir, dataset_name)
-      if not os.path.isdir(dataset_path):
-        continue
-      if dataset_name.startswith('.'):
-        continue
-
-      dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name))
-
-  return dataset_infos
+  return list_datasets(data_path())
 
 
 class WebManifest(BaseModel):
diff --git a/src/server.py b/src/server.py
index 970517ff9..d06883d32 100644
--- a/src/server.py
+++ b/src/server.py
@@ -2,17 +2,21 @@
 
 import logging
 import os
-from contextlib import asynccontextmanager
+import shutil
+import subprocess
 from typing import Any
 
 from fastapi import APIRouter, FastAPI
 from fastapi.responses import ORJSONResponse
 from fastapi.routing import APIRoute
 from fastapi.staticfiles import StaticFiles
+from huggingface_hub import snapshot_download
 
 from . import router_concept, router_data_loader, router_dataset, router_signal, router_tasks
+from .config import CONFIG, data_path
 from .router_utils import RouteErrorHandler
 from .tasks import task_manager
+from .utils import get_dataset_output_dir, list_datasets
 
 DIST_PATH = os.path.abspath(os.path.join('web', 'blueprint', 'build'))
 
@@ -54,14 +58,42 @@ def custom_generate_unique_id(route: APIRoute) -> str:
 app.mount('/', StaticFiles(directory=os.path.join(DIST_PATH), html=True, check_dir=False))
 
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-  """The lifespan hook for the server."""
+@app.on_event('startup')
+def startup() -> None:
+  """Download dataset files from the HF space that is uploaded before building the image."""
   # Setup.
-
-  yield
-
-  # Teardown.
+  repo_id = CONFIG.get('LILAC_DL_DATA_FROM_HF_SPACE', None)
+
+  if repo_id:
+    # Download the huggingface space data. This includes code and datasets, so we move the datasets
+    # alone to the data directory.
+    spaces_download_dir = os.path.join(data_path(), '.hf-spaces', repo_id)
+    snapshot_download(
+      repo_id=repo_id,
+      repo_type='space',
+      local_dir=spaces_download_dir,
+      local_dir_use_symlinks=False,
+      token=CONFIG['HF_ACCESS_TOKEN'])
+
+    datasets = list_datasets(os.path.join(spaces_download_dir, 'data'))
+    for dataset in datasets:
+      spaces_dataset_output_dir = get_dataset_output_dir(
+        os.path.join(spaces_download_dir, 'data'), dataset.namespace, dataset.dataset_name)
+      persistent_output_dir = get_dataset_output_dir(data_path(), dataset.namespace,
+                                                     dataset.dataset_name)
+
+      shutil.rmtree(persistent_output_dir, ignore_errors=True)
+      print('~~~~moving', os.path.join(spaces_download_dir, dataset.namespace,
+                                       dataset.dataset_name), 'to', persistent_output_dir)
+      shutil.move(spaces_dataset_output_dir, persistent_output_dir)
+
+    run('ls -al')
+    run(f'ls {data_path()}')
+
+
+def run(cmd: str) -> subprocess.CompletedProcess[bytes]:
+  """Run a command and return the result."""
+  return subprocess.run(cmd, shell=True, check=True)
 
 
 @app.on_event('shutdown')
diff --git a/src/utils.py b/src/utils.py
index eb99ceaba..79b13ca35 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -106,6 +106,43 @@ def get_dataset_output_dir(base_dir: Union[str, pathlib.Path], namespace: str,
   return os.path.join(get_datasets_dir(base_dir), namespace, dataset_name)
 
 
+class DatasetInfo(BaseModel):
+  """Information about a dataset."""
+  namespace: str
+  dataset_name: str
+  description: Optional[str]
+
+
+def list_datasets(base_dir: Union[str, pathlib.Path]) -> list[DatasetInfo]:
+  """List the datasets in a data directory."""
+  datasets_path = get_datasets_dir(base_dir)
+
+  # Skip if 'datasets' doesn't exist.
+  if not os.path.isdir(datasets_path):
+    return []
+
+  dataset_infos: list[DatasetInfo] = []
+  for namespace in os.listdir(datasets_path):
+    dataset_dir = os.path.join(datasets_path, namespace)
+    # Skip if namespace is not a directory.
+    if not os.path.isdir(dataset_dir):
+      continue
+    if namespace.startswith('.'):
+      continue
+
+    for dataset_name in os.listdir(dataset_dir):
+      # Skip if dataset_name is not a directory.
+      dataset_path = os.path.join(dataset_dir, dataset_name)
+      if not os.path.isdir(dataset_path):
+        continue
+      if dataset_name.startswith('.'):
+        continue
+
+      dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name))
+
+  return dataset_infos
+
+
 class CopyRequest(BaseModel):
   """A request to copy a file from source to destination path.
 
   Used to copy media files to GCS."""
   from_path: str

From 2a0661e0563b1e764e04c546cd80c780866c887f Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Sun, 9 Jul 2023 18:40:56 -0400
Subject: [PATCH 3/7] save

---
 .env                 |  2 +-
 scripts/deploy_hf.py | 18 +++++++++---------
 src/server.py        | 20 +++++++++++---------
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/.env b/.env
index a67ad5c57..bf277f11a 100644
--- a/.env
+++ b/.env
@@ -29,4 +29,4 @@ DUCKDB_USE_VIEWS=0
 # Get a token from https://huggingface.co/settings/tokens
 # HF_ACCESS_TOKEN=
 # To sync data from huggingface before the server boots.
-# LILAC_DL_DATA_FROM_HF_SPACE='HF_ORG/HF_REPO_NAME'
+# LILAC_DATA_FROM_HF_SPACE='HF_ORG/HF_REPO_NAME'
diff --git a/scripts/deploy_hf.py b/scripts/deploy_hf.py
index af31a8322..e1a83a751 100644
--- a/scripts/deploy_hf.py
+++ b/scripts/deploy_hf.py
@@ -65,15 +65,6 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
 
   run(f'poetry export --without-hashes > {repo_basedir}/requirements.txt')
 
-  # Create a .gitignore to avoid uploading unnecessary files.
-  with open(f'{repo_basedir}/.gitignore', 'w') as f:
-    f.write("""**/__pycache__
-**/*.pyc
-**/*.pyo
-**/*.pyd
-**/*_test.py
-""")
-
   # Copy source code.
   copy_dirs = ['src', 'web/blueprint/build']
   for dir in copy_dirs:
@@ -86,6 +77,15 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
   for file in copy_files:
     run(f'cp ./{file} {repo_basedir}/{file}')
 
+  # Create a .gitignore to avoid uploading unnecessary files.
+  with open(f'{repo_basedir}/.gitignore', 'w') as f:
+    f.write("""**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/*_test.py
+""")
+
   # Create the huggingface README.
   with open(f'{repo_basedir}/README.md', 'w') as f:
     f.write("""---
diff --git a/src/server.py b/src/server.py
index d06883d32..6f17744f3 100644
--- a/src/server.py
+++ b/src/server.py
@@ -7,7 +7,7 @@
 from typing import Any
 
 from fastapi import APIRouter, FastAPI
-from fastapi.responses import ORJSONResponse
+from fastapi.responses import FileResponse, ORJSONResponse
 from fastapi.routing import APIRoute
 from fastapi.staticfiles import StaticFiles
 from huggingface_hub import snapshot_download
@@ -54,15 +54,22 @@ def custom_generate_unique_id(route: APIRoute) -> str:
 
 app.include_router(v1_router, prefix='/api/v1')
 
+
+@app.api_route('/{path_name}', include_in_schema=False)
+def catch_all() -> FileResponse:
+  """Catch any other requests and serve index for HTML5 history."""
+  return FileResponse(path=os.path.join(DIST_PATH, 'index.html'))
+
+
 # Serve static files in production mode.
-app.mount('/', StaticFiles(directory=os.path.join(DIST_PATH), html=True, check_dir=False))
+app.mount('/', StaticFiles(directory=DIST_PATH, html=True, check_dir=False))
 
 
 @app.on_event('startup')
 def startup() -> None:
-  """Download dataset files from the HF space that is uploaded before building the image."""
+  """Download dataset files from the HF space that was uploaded before building the image."""
   # Setup.
-  repo_id = CONFIG.get('LILAC_DL_DATA_FROM_HF_SPACE', None)
+  repo_id = CONFIG.get('LILAC_DATA_FROM_HF_SPACE', None)
 
   if repo_id:
     # Download the huggingface space data. This includes code and datasets, so we move the datasets
     # alone to the data directory.

From c3ea658e52edcd39200832aae93b853d0f799c52 Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Sun, 9 Jul 2023 19:31:17 -0400
Subject: [PATCH 4/7] save

---
 .env                                              |  6 +++++-
 .gitignore                                        |  2 --
 README.md                                         |  3 ++-
 scripts/deploy_hf.py                              |  9 +++++++--
 src/server.py                                     |  4 ++--
 .../lib/components/datasetView/SearchPanel.svelte | 14 ++++++++++++++
 web/blueprint/src/routes/+layout.svelte           |  5 +++++
 7 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/.env b/.env
index bf277f11a..ba2995c32 100644
--- a/.env
+++ b/.env
@@ -20,13 +20,17 @@ DUCKDB_USE_VIEWS=0
 # Get key from https://makersuite.google.com/app/apikey
 # PALM_API_KEY=
 
+# HuggingFace demos: machine that uploads to HuggingFace.
+
 # For authenticating with HuggingFace to deploy to a Space.
 # HF_USERNAME=
 # The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
 # HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
 
+# HuggingFace demos: HuggingFace machine that runs the demo.
+
 # To read private uploaded data from the server (running on HF spaces) for the demo.
 # Get a token from https://huggingface.co/settings/tokens
 # HF_ACCESS_TOKEN=
 # To sync data from huggingface before the server boots.
-# LILAC_DATA_FROM_HF_SPACE='HF_ORG/HF_REPO_NAME'
+# HF_DATA_FROM_SPACE='HF_ORG/HF_REPO_NAME'
diff --git a/.gitignore b/.gitignore
index e191b8620..fb437b73c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,8 +8,6 @@ cloned_repos/
 py_coverage_html/
 *.deps.txt
 requirements.txt
-# Cloned huggingface spaces repos for pushing demos.
-hf_spaces/
 
 # Mac OS.
 .DS_Store
diff --git a/README.md b/README.md
index 9cb4f1948..e6c8d8efd 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,8 @@ HF_STAGING_DEMO_REPO='lilacai/your-space'
 HF_USERNAME='your-username'
 ```
 
-Set the variables on the HuggingFace space from the UI to authenticate the binary running on HuggingFace to read private space data:
+Set the environment variables on the HuggingFace space from the HuggingFace Space Settings UI to
+authenticate the binary running on HuggingFace to read private space data:
 
 - `LILAC_DL_DATA_FROM_HF_SPACE`: lilacai/your-space
 - `HF_ACCESS_TOKEN`: yourtoken
diff --git a/scripts/deploy_hf.py b/scripts/deploy_hf.py
index e1a83a751..a1861b523 100644
--- a/scripts/deploy_hf.py
+++ b/scripts/deploy_hf.py
@@ -9,7 +9,7 @@
 from src.config import CONFIG, data_path
 from src.utils import get_dataset_output_dir
 
-HF_SPACE_DIR = 'hf_spaces'
+HF_SPACE_DIR = os.path.join(data_path(), '.hf_spaces')
 
 
 @click.command()
@@ -23,7 +23,7 @@
 @click.option(
   '--skip_build',
   help='Skip building the web server TypeScript. '
-  'Useful if you are only changing python or are only changing data.',
+  'Useful to speed up the build if you are only changing python or data.',
   type=bool,
   default=False)
 @click.option('--dataset', help='The name of a dataset to upload', type=str, multiple=True)
@@ -53,6 +53,7 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
       # Delete all data on the server.
       delete_patterns='*')
 
+  # Build the web server Svelte & TypeScript.
   if not skip_build:
     run('sh ./scripts/build_server_prod.sh')
 
@@ -63,6 +64,10 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
   run(f'rm -rf {repo_basedir}')
   run(f'git clone https://{hf_username}@huggingface.co/spaces/{hf_space} {repo_basedir}')
 
+  # Clear out the repo.
+  run(f'rm -rf {repo_basedir}/*')
+
+  # Export the requirements file so it can be pip installed in the docker container.
   run(f'poetry export --without-hashes > {repo_basedir}/requirements.txt')
 
   # Copy source code.
diff --git a/src/server.py b/src/server.py
index 6f17744f3..4e9f9ff11 100644
--- a/src/server.py
+++ b/src/server.py
@@ -69,12 +69,12 @@ def catch_all() -> FileResponse:
 def startup() -> None:
   """Download dataset files from the HF space that was uploaded before building the image."""
   # Setup.
-  repo_id = CONFIG.get('LILAC_DATA_FROM_HF_SPACE', None)
+  repo_id = CONFIG.get('HF_DATA_FROM_SPACE', None)
 
   if repo_id:
     # Download the huggingface space data. This includes code and datasets, so we move the datasets
     # alone to the data directory.
-    spaces_download_dir = os.path.join(data_path(), '.hf-spaces', repo_id)
+    spaces_download_dir = os.path.join(data_path(), '.hf_spaces', repo_id)
     snapshot_download(
       repo_id=repo_id,
       repo_type='space',
diff --git a/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte b/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
index 18fe7009b..a0d768ccd 100644
--- a/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
+++ b/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
@@ -252,6 +252,20 @@
     }
     datasetViewStore.setSortOrder(sort?.order === 'ASC' ? 'DESC' : 'ASC');
   };
+  // const pageClickHandler = () => console.log('clicky');
+  // onMount(() => {
+  //   if (parent) {
+  //     console.log(parent, '=', parent);
+  //     window.parent = window;
+  //     parent.addEventListener('click', pageClickHandler);
+  //   }
+
+  //   return () => {
+  //     if (parent) {
+  //       parent.removeEventListener('click', pageClickHandler);
+  //     }
+  //   };
+  // });
diff --git a/web/blueprint/src/routes/+layout.svelte b/web/blueprint/src/routes/+layout.svelte
index 4f1d91e77..e43c36550 100644
--- a/web/blueprint/src/routes/+layout.svelte
+++ b/web/blueprint/src/routes/+layout.svelte
@@ -18,6 +18,11 @@
   let showError: ApiError | undefined = undefined;
 
   onMount(() => {
+    // This fixes a cross-origin error when the app is embedded in an iframe. Some carbon
+    // components attach listeners to window.parent, which is not allowed in an iframe, so we set
+    // the parent to window.
+    window.parent = window;
+
     urlHash.set(location.hash);
     history.pushState = function (_state, _unused, url) {
       if (url instanceof URL) {

From fc66f83264f94118f1f7396a88745c01b72aad2b Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Sun, 9 Jul 2023 19:33:15 -0400
Subject: [PATCH 5/7] save

---
 src/server.py                                     |  3 ++-
 .../lib/components/datasetView/SearchPanel.svelte | 14 --------------
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/src/server.py b/src/server.py
index 4e9f9ff11..15c53c89e 100644
--- a/src/server.py
+++ b/src/server.py
@@ -68,7 +68,6 @@ def catch_all() -> FileResponse:
 @app.on_event('startup')
 def startup() -> None:
   """Download dataset files from the HF space that was uploaded before building the image."""
-  # Setup.
   repo_id = CONFIG.get('HF_DATA_FROM_SPACE', None)
 
   if repo_id:
@@ -89,6 +88,8 @@ def startup() -> None:
       persistent_output_dir = get_dataset_output_dir(data_path(), dataset.namespace,
                                                      dataset.dataset_name)
 
+      # Huggingface doesn't let you selectively download files, so we just copy the data directory
+      # out of the cloned space.
       shutil.rmtree(persistent_output_dir, ignore_errors=True)
       shutil.move(spaces_dataset_output_dir, persistent_output_dir)
 
diff --git a/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte b/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
index a0d768ccd..18fe7009b 100644
--- a/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
+++ b/web/blueprint/src/lib/components/datasetView/SearchPanel.svelte
@@ -252,20 +252,6 @@
     }
     datasetViewStore.setSortOrder(sort?.order === 'ASC' ? 'DESC' : 'ASC');
   };
-  // const pageClickHandler = () => console.log('clicky');
-  // onMount(() => {
-  //   if (parent) {
-  //     console.log(parent, '=', parent);
-  //     window.parent = window;
-  //     parent.addEventListener('click', pageClickHandler);
-  //   }
-
-  //   return () => {
-  //     if (parent) {
-  //       parent.removeEventListener('click', pageClickHandler);
-  //     }
-  //   };
-  // });
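
The patch below replaces the Dockerfile's single-process `uvicorn` CMD with `gunicorn` supervising uvicorn workers. As a rough local sketch of the equivalent invocation (assuming `gunicorn` and `uvicorn` are installed in the environment, e.g. via the poetry dependency this patch adds):

```sh
# Serve the FastAPI app with gunicorn managing uvicorn workers,
# mirroring the new Dockerfile CMD.
gunicorn src.server:app \
  --bind 0.0.0.0:5432 \
  -k uvicorn.workers.UvicornWorker
```
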
From ff79ad7b3a158d4b27d4b8d646ebc1ed870b961c Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Mon, 10 Jul 2023 12:32:31 -0400
Subject: [PATCH 6/7] save

---
 Dockerfile           |  6 +++++-
 README.md            | 11 ++++++-----
 mypy.ini             |  5 +++++
 poetry.lock          | 22 +++++++++++++++++++++-
 pyproject.toml       |  1 +
 scripts/deploy_hf.py |  3 +--
 6 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 7a2a6a50b..e6a9569af 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,4 +21,8 @@ COPY /web/blueprint/build ./web/blueprint/build
 # Copy python files.
 COPY /src ./src/
 
-CMD ["uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "5432"]
+CMD [ \
+  "gunicorn", "src.server:app", \
+  "--bind", "0.0.0.0:5432", \
+  "-k", "uvicorn.workers.UvicornWorker" \
+  ]
diff --git a/README.md b/README.md
index e6c8d8efd..4b611303c 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ We use the HuggingFace git server, [follow the instructions](https://huggingface
 
 ###### Staging demo
 
+Create a HuggingFace space.
 Make sure you have created a HuggingFace space: [huggingface.co/spaces](https://huggingface.co/spaces)
 
 Set .env.local environment variables so you can upload data to the space:
@@ -51,14 +52,14 @@ HF_STAGING_DEMO_REPO='lilacai/your-space'
 HF_USERNAME='your-username'
 ```
 
-Set the environment variables on the HuggingFace space from the HuggingFace Space Settings UI to
-authenticate the binary running on HuggingFace to read private space data:
+- Generate a read-only token from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) which will be used as `HF_ACCESS_TOKEN` below.
+- Open the HuggingFace space in your browser and click "Settings".
+- Set these two environment variables from the settings UI to
+  authenticate the binary running on HuggingFace to read private space data:
 
 - `LILAC_DL_DATA_FROM_HF_SPACE`: lilacai/your-space
 - `HF_ACCESS_TOKEN`: yourtoken
 
-NOTE: `HF_ACCESS_TOKEN` can be generated from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). Create a read-only token for this step.
-
 To deploy to huggingface:
 
 ```
@@ -73,7 +74,7 @@ poetry run python -m scripts.deploy_hf \
 To build the docker image:
 
 ```sh
-./build_docker.sh
+./scripts/build_docker.sh
 ```
 
 To run the docker image locally:
diff --git a/mypy.ini b/mypy.ini
index 8bf6a9b15..6561d2d15 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -116,3 +116,8 @@ follow_imports = skip
 [mypy-google.generativeai.*]
 ignore_missing_imports = True
 follow_imports = skip
+
+[mypy-huggingface_hub.*]
+ignore_missing_imports = True
+follow_imports = skip
+
diff --git a/poetry.lock b/poetry.lock
index 4efc8f454..f01b90575 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1887,6 +1887,26 @@ googleapis-common-protos = ">=1.5.5"
 grpcio = ">=1.56.0"
 protobuf = ">=4.21.6"
 
+[[package]]
+name = "gunicorn"
+version = "20.1.0"
+description = "WSGI HTTP Server for UNIX"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "gunicorn-20.1.0-py3-none-any.whl", hash = "sha256:9dcc4547dbb1cb284accfb15ab5667a0e5d1881cc443e0677b4882a4067a807e"},
+    {file = "gunicorn-20.1.0.tar.gz", hash = "sha256:e0a968b5ba15f8a328fdfd7ab1fcb5af4470c28aaf7e55df02a99bc13138e6e8"},
+]
+
+[package.dependencies]
+setuptools = ">=3.0"
+
+[package.extras]
+eventlet = ["eventlet (>=0.24.1)"]
+gevent = ["gevent (>=1.4.0)"]
+setproctitle = ["setproctitle"]
+tornado = ["tornado (>=0.2)"]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@@ -6211,4 +6231,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = "~3.9"
-content-hash = "2ef9495d9487c43879081a097f41955de910032019a1369f8c946b36c68379e5"
+content-hash = "3a2070f9d45f19db63333a16c0034a3584c07d9c3013728b1f90b27fb87a2cba"
diff --git a/pyproject.toml b/pyproject.toml
index 0b93623bc..75c87bed2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ email-reply-parser = "^0.5.12"
 
 # For text statistics.
 textacy = "^0.13.0"
+gunicorn = "^20.1.0"
 
 [tool.poetry.group.dev]
 # Deps for development.
 optional = true
diff --git a/scripts/deploy_hf.py b/scripts/deploy_hf.py
index a1861b523..809c05d12 100644
--- a/scripts/deploy_hf.py
+++ b/scripts/deploy_hf.py
@@ -62,7 +62,7 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
   # Clone the HuggingFace spaces repo.
   repo_basedir = os.path.join(HF_SPACE_DIR, hf_space)
   run(f'rm -rf {repo_basedir}')
-  run(f'git clone https://{hf_username}@huggingface.co/spaces/{hf_space} {repo_basedir}')
+  run(f'git clone https://{hf_username}@huggingface.co/spaces/{hf_space} {repo_basedir} --depth 1')
 
   # Clear out the repo.
   run(f'rm -rf {repo_basedir}/*')
@@ -73,7 +73,6 @@ def main(hf_username: Optional[str], hf_space: Optional[str], dataset: list[str]
   # Copy source code.
   copy_dirs = ['src', 'web/blueprint/build']
   for dir in copy_dirs:
-    run(f'rm -rf {repo_basedir}/{dir}')
     run(f'mkdir -p {repo_basedir}/{dir}')
     run(f'cp -vaR ./{dir}/* {repo_basedir}/{dir}')

From 53336b64721c9786888dda106a0ccc3786b91ab0 Mon Sep 17 00:00:00 2001
From: nsthorat
Date: Mon, 10 Jul 2023 12:36:17 -0400
Subject: [PATCH 7/7] save

---
 README.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 4b611303c..d31e673d4 100644
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ We use the HuggingFace git server, [follow the instructions](https://huggingface
 
 ###### Staging demo
 
-Create a HuggingFace space.
-Make sure you have created a HuggingFace space: [huggingface.co/spaces](https://huggingface.co/spaces)
+1. Create a HuggingFace space.
+   Create a HuggingFace space from your browser: [huggingface.co/spaces](https://huggingface.co/spaces)
 
-Set .env.local environment variables so you can upload data to the space:
+2. Set .env.local environment variables so you can upload data to the space:
 
 ```sh
 # The repo to use for the huggingface demo.
 HF_STAGING_DEMO_REPO='lilacai/your-space'
 # To authenticate with HuggingFace for uploading to the space.
 HF_USERNAME='your-username'
 ```
 
-- Generate a read-only token from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) which will be used as `HF_ACCESS_TOKEN` below.
-- Open the HuggingFace space in your browser and click "Settings".
-- Set these two environment variables from the settings UI to
-  authenticate the binary running on HuggingFace to read private space data:
+3. Generate a read-only token from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) which will be used as `HF_ACCESS_TOKEN` below.
+
+4. Open the HuggingFace space in your browser and click "Settings".
+
+5. Set these two environment variables from the settings UI to
+   authenticate the binary running on HuggingFace to read private space data:
 
 - `LILAC_DL_DATA_FROM_HF_SPACE`: lilacai/your-space
 - `HF_ACCESS_TOKEN`: yourtoken
 
-To deploy to huggingface:
+6. Deploy to your HuggingFace Space:
 
 ```
 poetry run python -m scripts.deploy_hf \