Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,4 @@ venv.bak/

# SQL files
.sql
__pycache__/
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,52 @@

Data backup and recovery service for the CALYPR systems πŸ”„

## Deployment

This service is deployed using [Helm charts](https://github.com/ohsu-comp-bio/helm-charts/tree/main/charts/backups). The helm chart provides:

- Automated backup scheduling via CronJob
- Secret management integration with existing PostgreSQL secrets
- Configurable S3 storage backends
- Simple deployment with `helm install`

For manual job execution:
```bash
# Create a backup job
kubectl create job backup-job --from=cronjob/backup-service-cronjob --namespace backups

# Create a restore job (set OPERATION=restore environment variable)
kubectl create job restore-job --from=cronjob/backup-service-cronjob --namespace backups
# Note: You'll need to patch the created job to set the OPERATION=restore environment variable
```

## Configuration

The service can be configured through environment variables:

- **OPERATION**: `backup` (default) or `restore` - determines the operation mode
- **RESTORE_DIR**: Directory path for restore operations (defaults to timestamped directory)
- **PGPASSWORD**: Can be sourced from the existing `local-postgresql` secret (base64 encoded)
- **GRIP_GRAPH**: Graph name (should be configurable via helm global config)
- **GRIP_LIMIT**: Query limit (should be removed for production use)

The helm chart automatically handles secret management and configuration from global helm values.

## Best Practices

### Namespace Configuration
While the helm chart defaults to a separate `backups` namespace, consider deploying in the same namespace as your databases. Because the backup service needs direct access to database resources, co-locating it simplifies both network access and secret sharing.

### Secret Management
- **No separate secrets needed**: PGPASSWORD can be extracted from the existing `local-postgresql` secret (base64 encoded)
- **Helm integration**: All configuration should be managed through helm values files
- **S3 credentials**: Configure S3 bucket and credentials through helm secrets file

### Performance and Storage
- **Remove query limits**: Production deployments should remove GRIP_LIMIT for complete backups
- **Backup retention**: Implement a retention policy (e.g., keep daily backups for 30 days, weekly for 3+ months)
- **Global configuration**: Use helm global config for shared values like graph names instead of hardcoding

# 2. Quick Start ⚑

```sh
Expand Down
87 changes: 64 additions & 23 deletions entrypoint.sh
100644 β†’ 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,69 @@ set -e

TIMESTAMP=$(date +"%Y-%m-%dT%H:%M:%S")

# Default operation is backup, but can be overridden with OPERATION env var
OPERATION=${OPERATION:-backup}

export DIR="${DIR}/${TIMESTAMP}"

# Postgres Dump
bak --debug pg dump \
--dir "${DIR}" \
--host "${PGHOST}" \
--user "${PGUSER}" \
--password "${PGPASSWORD}"

# GRIP Backup
bak --debug grip backup \
--dir "${DIR}" \
--host "${GRIP_HOST}" \
--graph "${GRIP_GRAPH}" \
--limit "${GRIP_LIMIT}" \
--vertex \
--edge

# S3 Upload
bak --debug s3 upload \
--dir "${DIR}" \
--endpoint "${ENDPOINT}" \
--bucket "${BUCKET}" \
--key "${KEY}" \
--secret "${SECRET}"
if [ "$OPERATION" = "backup" ]; then
echo "Starting backup operation..."

# Postgres Dump
bak --debug pg dump \
--dir "${DIR}" \
--host "${PGHOST}" \
--user "${PGUSER}" \
--password "${PGPASSWORD}"

# GRIP Backup
bak --debug grip backup \
--dir "${DIR}" \
--host "${GRIP_HOST}" \
--graph "${GRIP_GRAPH}" \
--limit "${GRIP_LIMIT}" \
--vertex \
--edge

# S3 Upload
bak --debug s3 upload \
--dir "${DIR}" \
--endpoint "${ENDPOINT}" \
--bucket "${BUCKET}" \
--key "${KEY}" \
--secret "${SECRET}"

echo "Backup operation completed successfully"

elif [ "$OPERATION" = "restore" ]; then
echo "Starting restore operation..."

# S3 Download - restore from specified backup directory or latest
RESTORE_DIR=${RESTORE_DIR:-"${DIR}"}

bak --debug s3 download \
--dir "${RESTORE_DIR}" \
--endpoint "${ENDPOINT}" \
--bucket "${BUCKET}" \
--key "${KEY}" \
--secret "${SECRET}"

# Postgres Restore
bak --debug pg restore \
--dir "${RESTORE_DIR}" \
--host "${PGHOST}" \
--user "${PGUSER}" \
--password "${PGPASSWORD}"

# GRIP Restore
bak --debug grip restore \
--dir "${RESTORE_DIR}" \
--host "${GRIP_HOST}" \
--graph "${GRIP_GRAPH}"

echo "Restore operation completed successfully"

else
echo "Error: Unknown operation '${OPERATION}'. Valid operations are 'backup' or 'restore'"
exit 1
fi
77 changes: 42 additions & 35 deletions src/backup/grip/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _getEdges(grip: GripConfig, graph: str, limit: int) -> list[str]:

G = c.graph(graph)

for i in G.query().E().limit(limit):
for i in G.E().limit(limit):
edges.append(i)

return edges
Expand All @@ -62,7 +62,7 @@ def _getVertices(grip: GripConfig, graph: str, limit: int) -> list[str]:

G = c.graph(graph)

for i in G.query().V().limit(limit):
for i in G.V().limit(limit):
vertices.append(i)

return vertices
Expand Down Expand Up @@ -91,12 +91,12 @@ def _dump(grip: GripConfig, graph: str, limit: int, vertex: bool, edge: bool, ou
# write vertex and edge objects from grip DB to file
if vertex:
with open(out / f"{graph}.vertices", "wb") as f:
for i in G.query().V().limit(limit):
for i in G.V().limit(limit):
f.write(orjson.dumps(i, option=orjson.OPT_APPEND_NEWLINE))

if edge:
with open(out / f"{graph}.edges", "wb") as f:
for i in G.query().E().limit(limit):
for i in G.E().limit(limit):
f.write(orjson.dumps(i, option=orjson.OPT_APPEND_NEWLINE))

# TODO: At this point you will need to reconnect to the new grip instance to load the data that was dumped
Expand All @@ -107,34 +107,41 @@ def _restore(grip: GripConfig, graph: str, dir: Path):
conn = _connect(grip)
G = conn.graph(graph)

bulkV = G.bulkAdd()
with open("grip.vertices", "rb") as f:
count = 0
for i in f:
data = orjson.loads(i)
_id = data["_id"]
_label = data["_label"]
del data["_id"], data["_label"]
bulkV.addVertex(_id, _label, data)
count += 1
if count % 10000 == 0:
print("loaded %d vertices" % count)
err = bulkV.execute()
print("Vertices load res: ", str(err))

bulkE = G.bulkAdd()
with open("grip.edges", "rb") as f:
count = 0
for i in f:
data = orjson.loads(i)
_id = data["_id"]
_label = data["_label"]
_to = data["_to"]
_from = data["_from"]
del data["_id"], data["_label"], data["_to"], data["_from"]
bulkE.addEdge(_to, _from, _label, data=data, gid=_id)
count += 1
if count % 10000 == 0:
print("loaded %d edges" % count)
err = bulkE.execute()
print("Edges load res: ", str(err))
vertex_file = dir / f"{graph}.vertices"
edge_file = dir / f"{graph}.edges"

# Load vertices if file exists
if vertex_file.exists():
bulkV = G.bulkAdd()
with open(vertex_file, "rb") as f:
count = 0
for i in f:
data = orjson.loads(i)
_id = data["_id"]
_label = data["_label"]
del data["_id"], data["_label"]
bulkV.addVertex(_id, _label, data)
count += 1
if count % 10000 == 0:
print("loaded %d vertices" % count)
err = bulkV.execute()
print("Vertices load res: ", str(err))

# Load edges if file exists
if edge_file.exists():
bulkE = G.bulkAdd()
with open(edge_file, "rb") as f:
count = 0
for i in f:
data = orjson.loads(i)
_id = data["_id"]
_label = data["_label"]
_to = data["_to"]
_from = data["_from"]
del data["_id"], data["_label"], data["_to"], data["_from"]
bulkE.addEdge(_to, _from, _label, data=data, gid=_id)
count += 1
if count % 10000 == 0:
print("loaded %d edges" % count)
err = bulkE.execute()
print("Edges load res: ", str(err))
Binary file not shown.
Binary file not shown.