From e3d500dc4245ed40b7ba399974a72aa194eed5fc Mon Sep 17 00:00:00 2001 From: Edmund Higham Date: Fri, 2 Apr 2021 17:59:38 +0100 Subject: [PATCH] [GH-1221] import snapshots into a workspace (#347) RR: https://broadinstitute.atlassian.net/browse/GH-1221 In this PR, I'm adding test demonstration code for how to import data from a TDR snapshot into a Terra Workspace. This includes: Querying all the rows in the specified table in the snapshot Selecting and renaming the columns of the table to match those of the target entity Importing the rows as new entities into the workspace. The renaming works by viewing the rows of the table as a list of maps, mapping those names onto workspace names, then reconstructing the table. Added dataset/snapshot querying utilities to the datarepo namespace. Bug fixes: "flattening" the big-query rows didn't flatten rows of arrays --- api/src/wfl/environment.clj | 4 +- api/src/wfl/service/cromwell.clj | 13 +- api/src/wfl/service/datarepo.clj | 167 ++++++++++-------- api/src/wfl/service/firecloud.clj | 88 ++++++--- api/src/wfl/service/google/bigquery.clj | 58 +++--- api/src/wfl/util.clj | 62 ++++++- .../sarscov2-illumina-full-inputs.json | 45 +++-- .../dataset-from-inputs.edn | 30 +--- .../entity-from-dataset.edn | 8 + .../flowcell-ingest-request.edn | 11 ++ .../inputs-ingest-request.edn | 9 - api/test/wfl/integration/datarepo_test.clj | 95 ++++++++-- api/test/wfl/system/automation_test.clj | 22 +-- api/test/wfl/tools/datasets.clj | 14 +- api/test/wfl/tools/fixtures.clj | 12 +- api/test/wfl/tools/snapshots.clj | 29 +-- api/test/wfl/unit/google/bigquery_test.clj | 43 ++--- 17 files changed, 429 insertions(+), 281 deletions(-) create mode 100644 api/test/resources/workflows/sarscov2_illumina_full/entity-from-dataset.edn create mode 100644 api/test/resources/workflows/sarscov2_illumina_full/flowcell-ingest-request.edn delete mode 100644 api/test/resources/workflows/sarscov2_illumina_full/inputs-ingest-request.edn diff --git a/api/src/wfl/environment.clj b/api/src/wfl/environment.clj index d2fece9cf..eeb7eb6c9 100644 --- a/api/src/wfl/environment.clj +++ b/api/src/wfl/environment.clj @@ -37,7 +37,7 @@ "WFL_COOKIE_SECRET" #(-> "secret/dsde/gotc/dev/zero" vault-secrets :cookie_secret) "WFL_TDR_URL" - #(-> "https://jade.datarepo-dev.broadinstitute.org/") + #(-> "https://jade.datarepo-dev.broadinstitute.org") "WFL_OAUTH2_CLIENT_ID" #(-> "secret/dsde/gotc/dev/zero" vault-secrets :oauth2_client_id) "WFL_POSTGRES_PASSWORD" @@ -47,7 +47,7 @@ "WFL_POSTGRES_USERNAME" #(-> nil) "WFL_FIRECLOUD_URL" - #(-> "https://api.firecloud.org/") + #(-> "https://api.firecloud.org") ;; -- variables used in test code below this line -- "WFL_CROMWELL_URL" diff --git a/api/src/wfl/service/cromwell.clj b/api/src/wfl/service/cromwell.clj index 64d72fc60..d69daf466 100644 --- a/api/src/wfl/service/cromwell.clj +++ b/api/src/wfl/service/cromwell.clj @@ -263,13 +263,6 @@ (defn wait-for-workflow-complete "Return status of workflow named by ID when it completes, given Cromwell URL." [url id] - (work-around-cromwell-fail-bug 9 url id) - (loop [url url id id] - (let [seconds 15 - now (status url id)] - (if (#{"Submitted" "Running"} now) - (do (log/infof "%s: Sleeping %s seconds on status: %s" - id seconds now) - (util/sleep-seconds seconds) - (recur url id)) - (status url id))))) + (let [finished? (comp (set final-statuses) #(status url id))] + (work-around-cromwell-fail-bug 9 url id) + (util/poll finished? 
15 (Integer/MAX_VALUE)))) diff --git a/api/src/wfl/service/datarepo.clj b/api/src/wfl/service/datarepo.clj index b65342611..977f3165e 100644 --- a/api/src/wfl/service/datarepo.clj +++ b/api/src/wfl/service/datarepo.clj @@ -1,34 +1,38 @@ (ns wfl.service.datarepo "Do stuff in the data repo." - (:require [clj-http.client :as http] - [clojure.data.json :as json] - [clojure.string :as str] - [wfl.auth :as auth] - [wfl.environment :as env] - [wfl.mime-type :as mime-type] - [wfl.util :as util]) + (:require [clj-http.client :as http] + [clojure.data.json :as json] + [clojure.string :as str] + [wfl.auth :as auth] + [wfl.environment :as env] + [wfl.mime-type :as mime-type] + [wfl.service.google.bigquery :as bigquery] + [wfl.util :as util]) (:import (java.time Instant) (java.util.concurrent TimeUnit))) (defn ^:private datarepo-url [& parts] - (let [url (util/slashify (env/getenv "WFL_TDR_URL"))] - (apply str url parts))) + (let [url (util/de-slashify (env/getenv "WFL_TDR_URL"))] + (str/join "/" (cons url parts)))) (def ^:private repository "API URL for Data Repo API." - (partial datarepo-url "api/repository/v1/")) + (partial datarepo-url "api/repository/v1")) + +(defn ^:private get-repository-json [& parts] + (-> (apply repository parts) + (http/get {:headers (auth/get-auth-header)}) + util/response-body-json)) (defn dataset "Query the DataRepo for the Dataset with `dataset-id`." [dataset-id] - (-> (repository "datasets/" dataset-id) - (http/get {:headers (auth/get-service-account-header)}) - util/response-body-json)) + (get-repository-json "datasets" dataset-id)) (defn ^:private ingest "Ingest THING to DATASET-ID according to BODY." [thing dataset-id body] - (-> (repository (format "datasets/%s/%s" dataset-id thing)) + (-> (repository "datasets" dataset-id thing) (http/post {:content-type :application/json :headers (auth/get-service-account-header) :body (json/write-str body :escape-slash false)}) @@ -73,25 +77,24 @@ (ingest "ingest" dataset-id - {:format "json" - :load_tag "string" - :max_bad_records 0 - :path path - :table table})) + {:format "json" + :load_tag (new-load-tag) + :max_bad_records 0 + :path path + :table table})) (defn poll-job - "Return result for JOB-ID in ENVIRONMENT when it stops running." - [job-id] - (let [get-result #(-> (repository "jobs/" job-id "/result") - (http/get {:headers (auth/get-service-account-header)}) - util/response-body-json) - running? #(-> (repository "jobs/" job-id) - (http/get {:headers (auth/get-service-account-header)}) - util/response-body-json - :job_status - #{"running"})] - (while (running?) (.sleep TimeUnit/SECONDS 1)) - (get-result))) + "Poll the job with `job-id` every `seconds` [default: 5] and return its + result." + ([job-id seconds] + (let [result #(get-repository-json "jobs" job-id "result") + running? #(-> (get-repository-json "jobs" job-id) + :job_status + #{"running"})] + (while (running?) (.sleep TimeUnit/SECONDS seconds)) + (result))) + ([job-id] + (poll-job job-id 5))) (defn create-dataset "Create a dataset with EDN `dataset-request` and return the id @@ -111,26 +114,21 @@ :id)) (defn delete-dataset - "Delete the Dataset with `dataset-id`." + "Delete the dataset with `dataset-id`." 
[dataset-id] - (-> (repository "datasets/" dataset-id) + (-> (repository "datasets" dataset-id) (http/delete {:headers (auth/get-service-account-header)}) util/response-body-json :id poll-job)) -;; Note the TDR is under active development, -;; the endpoint spec is getting changed so the -;; spec in this function is not consistent with -;; the TDR Swagger page in order to make the -;; request work. +;; Note the TDR is under active development, the endpoint spec is getting +;; changed so the spec in this function is not consistent with the TDR Swagger +;; page in order to make the request work. ;; See also https://cloud.google.com/bigquery/docs/reference/standard-sql/migrating-from-legacy-sql (defn create-snapshot - "Return snapshot-id when the snapshot defined - by `snapshot-request` is ready. - - See `SnapshotRequestModel` in the - DataRepo swagger page for more information. + "Return snapshot-id when the snapshot defined by `snapshot-request` is ready. + See `SnapshotRequestModel` in the DataRepo swagger page for more information. https://jade.datarepo-dev.broadinstitute.org/swagger-ui.html#/" [snapshot-request] (-> (repository "snapshots") @@ -143,9 +141,8 @@ :id)) (defn list-snapshots - "Return snapshots optionally filtered by source dataset, - where dataset-ids identify the source datasets. - Hard-coded to return 999 pages for now. + "Return snapshots optionally filtered by source dataset, where dataset-ids + identify the source datasets. Hard-coded to return 999 pages for now. Parameters ---------- @@ -155,7 +152,7 @@ Example ------- (list-snapshots) - (list-snapshots \"48a51f71-6bab-483d-a270-3f9ebfb241cd\" \"85efdfea-52fb-4698-bee6-eef76104a7f4\")" + (list-snapshots \"48a51f71-6bab-483d-a270-3f9ebfb241cd\")" [& dataset-ids] (letfn [(maybe-merge [m k v] (if (seq v) (assoc m k {:datasetIds v}) m))] (-> (http/get (repository "snapshots") @@ -167,7 +164,7 @@ (defn delete-snapshot "Delete the Snapshot with `snapshot-id`." [snapshot-id] - (-> (repository "snapshots/" snapshot-id) + (-> (repository "snapshots" snapshot-id) (http/delete {:headers (auth/get-service-account-header)}) util/response-body-json :id @@ -176,30 +173,7 @@ (defn snapshot "Return the snapshot with `snapshot-id`." [snapshot-id] - (-> (repository "snapshots/" snapshot-id) - (http/get {:headers (auth/get-service-account-header)}) - util/response-body-json)) - -;; Note if there are no matching rows between (start, end], TDR will throw -;; a 400 exception. -;; Note TDR prefixes datasets in BigQuery with `datarepo_`. -(defn make-snapshot-query - "Make row-id query payload from `dataset` and `table`, - given a date range specified by exclusive `start` and inclusive `end`. - - Parameters - ---------- - _dataset - Dataset information response from TDR. - table - Name of the table in the dataset schema to query from. - start - The start date object in the timeframe to query exclusively. - end - The end date object in the timeframe to query inclusively." - [{:keys [name dataProject] :as _dataset} table start end] - (let [dataset-name (str "datarepo_" name) - query (str/join \newline ["SELECT datarepo_row_id" - "FROM `%s.%s.%s`" - "WHERE datarepo_ingest_date > '%s'" - "AND datarepo_ingest_date <= '%s'"])] - (format query dataProject dataset-name table start end))) + (get-repository-json "snapshots" snapshot-id)) (defn all-columns "Return all of the columns of `table` in `dataset` content." @@ -212,8 +186,8 @@ ;; Note TDR uses snapshot names as unique identifier so the ;; name must be unique among snapshots. 
(defn make-snapshot-request - "Return a snapshot request for `row-ids`and `columns` - from `table` name in `_dataset`." + "Return a snapshot request for `row-ids`and `columns` from `table` name + in `_dataset`." [{:keys [name defaultProfileId description] :as _dataset} columns table row-ids] (let [row-ids (vec row-ids)] {:contents [{:datasetName name @@ -224,3 +198,48 @@ :description description :name name :profileId defaultProfileId})) + +;; hack - TDR adds the "datarepo_" prefix to the dataset name in BigQuery +;; They plan to expose this name via `GET /api/repository/v1/datasets/{id}` +;; in a future release. +(defn ^:private bigquery-name + "Return the BigQuery name of the `dataset-or-snapshot`." + [{:keys [name] :as dataset-or-snapshot}] + (letfn [(snapshot? [x] (util/absent? x :defaultSnapshotId))] + (if (snapshot? dataset-or-snapshot) name (str "datarepo_" name)))) + +(defn ^:private query-table-impl + ([{:keys [dataProject] :as dataset} table col-spec] + (let [bq-name (bigquery-name dataset)] + (->> (format "SELECT %s FROM `%s.%s.%s`" col-spec dataProject bq-name table) + (bigquery/query-sync dataProject))))) + +(defn query-table + "Query everything or optionally the `columns` in `table` in the Terra DataRepo + `dataset`, where `dataset` is a DataRepo dataset or a snapshot of a dataset." + ([dataset table] + (query-table-impl dataset table "*")) + ([dataset table columns] + (->> (util/to-comma-separated-list (map name columns)) + (query-table-impl dataset table)))) + +(defn ^:private query-table-between-impl + [{:keys [dataProject] :as dataset} table [start end] col-spec] + (let [bq-name (bigquery-name dataset) + query "SELECT %s + FROM `%s.%s.%s` + WHERE datarepo_ingest_date > '%s' + AND datarepo_ingest_date <= '%s'"] + (->> (format query col-spec dataProject bq-name table start end) + (bigquery/query-sync dataProject)))) + +(defn query-table-between + "Query everything or optionally the `columns` in `table` in the Terra DataRepo + `dataset` in the open-closed `interval` of `datarepo_ingest_date`, where + `dataset` is a DataRepo dataset or a snapshot of a dataset. If no rows match + the `interval`, TDR will respond with error 400." + ([dataset table interval] + (query-table-between-impl dataset table interval "*")) + ([dataset table interval columns] + (->> (util/to-comma-separated-list (map name columns)) + (query-table-between-impl dataset table interval)))) diff --git a/api/src/wfl/service/firecloud.clj b/api/src/wfl/service/firecloud.clj index a0b8aedc9..f2cc00631 100644 --- a/api/src/wfl/service/firecloud.clj +++ b/api/src/wfl/service/firecloud.clj @@ -8,15 +8,21 @@ [wfl.util :as util])) (defn ^:private firecloud-url [& parts] - (let [url (util/slashify (env/getenv "WFL_FIRECLOUD_URL"))] - (apply str url parts))) + (let [url (util/de-slashify (env/getenv "WFL_FIRECLOUD_URL"))] + (str/join "/" (cons url parts)))) -(def workspace-api-url (partial firecloud-url "api/workspaces/")) +(def ^:private workspace-api-url + (partial firecloud-url "api/workspaces")) + +(defn ^:private get-workspace-json [& parts] + (-> (apply workspace-api-url parts) + (http/get {:headers (auth/get-auth-header)}) + util/response-body-json)) (defn abort-submission "Abort the submission with `submission-id` in the Terra `workspace`." 
[workspace submission-id] - (-> (workspace-api-url (str/join "/" [workspace "submissions" submission-id])) + (-> (workspace-api-url workspace "submissions" submission-id) (http/delete {:headers (auth/get-auth-header)}))) (defn create-submission @@ -24,7 +30,7 @@ [workspace methodconfig [entity-type entity-name :as _entity]] (let [[mcns mcn] (str/split methodconfig #"/")] (-> {:method :post - :url (workspace-api-url workspace "/submissions") + :url (workspace-api-url workspace "submissions") :headers (auth/get-auth-header) :content-type :application/json :body (json/write-str @@ -38,30 +44,44 @@ util/response-body-json :submissionId))) +(defn create-workspace + "Create an empty Terra workspace with the fully-qualified `workspace` name, + granting access to the `firecloud-group`." + [workspace firecloud-group] + (let [[namespace name] (str/split workspace #"/") + payload {:namespace namespace + :name name + :attributes {:description ""} + :authorizationDomain [{:membersGroupName firecloud-group}]}] + (-> (workspace-api-url) + (http/post {:headers (auth/get-auth-header) + :content-type :application/json + :body (json/write-str payload)}) + util/response-body-json))) + +(defn delete-workspace + "Delete the terra `workspace` and all data within." + [workspace] + (-> (workspace-api-url workspace) + (http/delete {:headers (auth/get-auth-header)}) + util/response-body-json)) + (defn get-submission "Return the submission in the Terra `workspace` with `submission-id`." [workspace submission-id] - (-> (workspace-api-url workspace "/submissions/" submission-id) - (http/get {:headers (auth/get-auth-header)}) - util/response-body-json)) + (get-workspace-json workspace "submissions" submission-id)) (defn get-workflow "Query the `firecloud-url` for the the `workflow` created by the `submission` in the Terra `workspace`." [workspace submission-id workflow-id] - (-> (str/join "/" [workspace "submissions" submission-id "workflows" workflow-id]) - workspace-api-url - (http/get {:headers (auth/get-auth-header)}) - util/response-body-json)) + (get-workspace-json workspace "submissions" submission-id "workflows" workflow-id)) (defn get-workflow-outputs "Query the `firecloud-url` for the outputs of the `workflow` created by the `submission` in the Terra `workspace`." [workspace submission-id workflow-id] - (-> (str/join "/" [workspace "submissions" submission-id "workflows" workflow-id "outputs"]) - workspace-api-url - (http/get {:headers (auth/get-auth-header)}) - util/response-body-json)) + (get-workspace-json workspace "submissions" submission-id "workflows" workflow-id "outputs")) (defn get-workflow-status-by-entity "Get workflow status given a Terra submission-id and entity-name." @@ -73,6 +93,20 @@ first :status))) +(defn delete-entities + "Delete the `entities` from the Terra `workspace`. + Parameters + ---------- + workspace - Terra Workspace to delete entities from + entities - list of entity `[type name]` pairs" + [workspace entities] + (letfn [(make-entity [[type name]] {:entityType type :entityName name})] + (-> (workspace-api-url workspace "entities" "delete") + (http/post {:headers (auth/get-auth-header) + :content-type :application/json + :body (json/write-str (map make-entity entities))}) + util/response-body-json))) + (defn import-entities "Import sample entities into a Terra WORKSPACE from a tsv FILE. The upload requires owner permission on the workspace. 
@@ -86,21 +120,31 @@ ------- (import-entities \"workspace-namespace/workspace-name\" \"./samples.tsv\")" [workspace file] - (-> (workspace-api-url workspace "/flexibleImportEntities") + (-> (workspace-api-url workspace "flexibleImportEntities") (http/post {:headers (auth/get-auth-header) :multipart (util/multipart-body {:Content/type "text/tab-separated-values" :entities (slurp file)})}))) +(defn list-entities + "List all entities with `entity-type` in `workspace`." + [workspace entity-type] + (get-workspace-json workspace "entities" entity-type)) + +(defn list-entity-types + "List the entity types along with their attributes in `workspace`." + [workspace] + (get-workspace-json workspace "entities")) + (defn describe-workflow - "Get a machine-readbale description of the `workflow`, including its inputs + "Get a machine-readable description of the `workflow`, including its inputs and outputs. `workflow` can either be a url or the workflow source code." [workflow] (letfn [(url? [s] (some #(str/starts-with? s %) ["http://" "https://"]))] (-> (firecloud-url "/api/womtool/v1/describe") (http/post {:headers (auth/get-auth-header) - :multipart (util/multipart-body - (if (url? workflow) - {:workflowUrl workflow} - {:workflowSource workflow}))}) + :multipart (util/multipart-body {(if (url? workflow) + :workflowUrl + :workflowSource) + workflow})}) util/response-body-json))) diff --git a/api/src/wfl/service/google/bigquery.clj b/api/src/wfl/service/google/bigquery.clj index a5dfeb546..fabe75153 100644 --- a/api/src/wfl/service/google/bigquery.clj +++ b/api/src/wfl/service/google/bigquery.clj @@ -46,6 +46,13 @@ util/response-body-json :tables)) +(defn ^:private normalize-table [{:keys [schema] :as response}] + (let [repeated? (mapv #(= "REPEATED" (:mode %)) (:fields schema))] + (letfn [(flatten-column [idx column] + (if (repeated? idx) (mapv :v (:v column)) (:v column))) + (flatten-row [row] (vec (map-indexed flatten-column (:f row))))] + (update response :rows #(map flatten-row %))))) + (defn query-sync "Given QUERY, look for rows in a BigQuery table within a Google Cloud PROJECT synchronously, using non-legacy query @@ -56,47 +63,40 @@ project - Google Cloud Project to list the BigQuery datasets in. query - BigQuery Standard SQL query string." [project query] - (letfn [(flatten-rows [rows] (mapv #(map :v (:f %)) rows))] - (-> (str/join "/" ["projects" project "queries"]) - bigquery-url - (http/post {:headers (auth/get-auth-header) - :body (json/write-str - {:query query - :use_legacy_sql false})}) - util/response-body-json - (update :rows flatten-rows)))) + (-> (str/join "/" ["projects" project "queries"]) + bigquery-url + (http/post {:headers (auth/get-auth-header) + :body (json/write-str {:query query + :use_legacy_sql false})}) + util/response-body-json + normalize-table)) (defn dump-table->tsv "Dump a BigQuery TABLE/view into a tsv FILE that's supported by Terra. Will dump the tsv contents to bytes if no FILE is provided. - The first row header must follow the format 'entity:{data_table}_id'. - For example, 'entity:sample_id' will upload the tsv data into a `sample` - table in the workspace (or create one if it does not exist). If the - table already contains a sample with that id, it will get overwritten. + The first column of the table must be the table primary key, + i.e. [entity-type]_id. For example, 'entity:sample_id' will upload the tsv + data into a `sample` table in the workspace (or create one if it does not + exist). 
If the table already contains a sample with that id, it will get + overwritten. Parameters ---------- table - BigQuery table/view body with the rows field flatten. - terra-data-table - The `table` name in the Terra workspace to import the TSV. file - [optional] TSV file name to dump. Example ------- - (dump-table->tsv table \"datarepo_row\" \"dumped.tsv\") - (dump-table->tsv table \"datarepo_row\")" - ([table terra-data-table file] - (letfn [(format-header-for-terra [header] - (cons (format "entity:%s_id" terra-data-table) (rest header)))] - (let [headers (map :name (get-in table [:schema :fields])) - rows (get-in table [:rows]) - contents (-> [] - (into [(format-header-for-terra headers)]) - (into rows))] + (dump-table->tsv table \"dumped.tsv\") + (dump-table->tsv table)" + ([table file] + (letfn [(format-entity-type [[head & rest]] + (cons (format "entity:%s" head) rest))] + (let [columns (map :name (get-in table [:schema :fields]))] (with-open [writer (io/writer file)] - (csv/write-csv writer - contents - :separator \tab) + (csv/write-csv writer [(format-entity-type columns)] :separator \tab) + (csv/write-csv writer (:rows table) :separator \tab) file)))) - ([table terra-data-table] - (str (dump-table->tsv table terra-data-table (StringWriter.))))) + ([table] + (str (dump-table->tsv table (StringWriter.))))) diff --git a/api/src/wfl/util.clj b/api/src/wfl/util.clj index 3e10f9465..8d9809d2f 100644 --- a/api/src/wfl/util.clj +++ b/api/src/wfl/util.clj @@ -1,17 +1,18 @@ (ns wfl.util "Some utilities shared across this program." - (:require [clojure.data.json :as json] - [clojure.java.io :as io] - [clojure.java.shell :as shell] - [clojure.string :as str] + (:require [clojure.data.json :as json] + [clojure.java.io :as io] + [clojure.java.shell :as shell] + [clojure.string :as str] [clojure.tools.logging :as log] - [wfl.wfl :as wfl]) + [wfl.wfl :as wfl]) (:import [java.io File Writer IOException] - [java.time OffsetDateTime Clock LocalDate] - [java.time.temporal ChronoUnit] [java.nio.file Files] [java.nio.file.attribute FileAttribute] + [java.time OffsetDateTime Clock LocalDate] + [java.time.temporal ChronoUnit] [java.util ArrayList Collections Random UUID] + [java.util.concurrent TimeUnit TimeoutException] [java.util.zip ZipOutputStream ZipEntry] [org.apache.commons.io FilenameUtils])) @@ -336,13 +337,23 @@ (with-open [in (io/input-stream resource)] (io/copy in file)) file))) +(defn between + "Place the `middle` lexeme between [`first` `second`]." + [[first second] middle] (str first middle second)) + +(defn to-comma-separated-list + "Return the sequence `xs` composed into a comma-separated list string. + Example: + (to-comma-separated-list '['x 'y 'z]) => \"(x,y,z)\"" + [xs] + (between "()" (str/join "," xs))) + (defn to-quoted-comma-separated-list "Return the sequence `xs` composed into a comma-separated list string. Example: (to-quoted-comma-separated-list '[x y z]) => \"('x','y','z')\"" [xs] - (letfn [(between [[first second] x] (str first x second))] - (between "()" (str/join "," (map (partial between "''") xs))))) + (between "()" (str/join "," (map (partial between "''") xs)))) (defn slashify "Ensure URL ends in a slash /." @@ -388,3 +399,36 @@ backward or forward depends on the sign of n." [^Integer n] (.plus (today) n ChronoUnit/DAYS)) + +(defn randomize + "Append a random suffix to `string`." + [string] + (->> (str/replace (UUID/randomUUID) "-" "") (str string))) + +(defn curry + "Curry the function `f` such that its arguments may be supplied across two + applications." 
+ [f] + (fn [x & xs] (apply partial f x xs))) + +(defn >>> + "Left-to-right function composition, ie `(= (>>> f g) (comp g f))`." + [f & fs] + (reduce #(comp %2 %1) f fs)) + +(defn poll + "Poll `task!` every `seconds` [default: 1], attempting at most `max-attempts` + [default: 3]." + ([task! seconds max-attempts] + (loop [attempt 1] + (if-let [result (task!)] + result + (do (when (<= max-attempts attempt) + (throw (TimeoutException. "Max number of attempts exceeded"))) + (log/debugf "Sleeping - attempt #%s of %s" attempt max-attempts) + (.sleep TimeUnit/SECONDS seconds) + (recur (inc attempt)))))) + ([task seconds] + (poll task seconds 3)) + ([task] + (poll task 1))) diff --git a/api/test/resources/datasets/sarscov2-illumina-full-inputs.json b/api/test/resources/datasets/sarscov2-illumina-full-inputs.json index 5178907e8..79b54f163 100644 --- a/api/test/resources/datasets/sarscov2-illumina-full-inputs.json +++ b/api/test/resources/datasets/sarscov2-illumina-full-inputs.json @@ -1,52 +1,49 @@ { "name": "sarscov2_illumina_full_inputs", - "description": "COVID-19 sarscov2_illumina_full pipeline inputs", + "description": "initial flowcell values for the sarscov2_illumina_full COVID-19 surveillance pipeline", "defaultProfileId": "390e7a85-d47f-4531-b612-165fc977d3bd", "schema": { "tables": [ { - "name": "sarscov2_illumina_full_inputs", + "name": "flowcell", "columns": [ { - "name": "flowcell_tgz", + "name": "authors_sbt", "datatype": "fileref" }, { - "name": "reference_fasta", - "datatype": "fileref" + "name": "biosample_map", + "datatype": "fileref", + "array_of": true }, { - "name": "amplicon_bed_prefix", + "name": "flowcell_id", "datatype": "string" }, { - "name": "biosample_attributes", - "datatype": "fileref", - "array_of": true + "name": "flowcell_tgz", + "datatype": "fileref" }, { "name": "instrument_model", "datatype": "string" }, { - "name": "sra_title", - "datatype": "string" - }, - { - "name": "min_genome_bases", - "datatype": "int64" + "name": "sample_rename_map", + "datatype": "fileref" }, { - "name": "max_vadr_alerts", - "datatype": "int64" + "name": "samplesheets", + "datatype": "fileref", + "array_of": true }, { - "name": "workspace_name", + "name": "title", "datatype": "string" }, { - "name": "terra_project", - "datatype": "string" + "name": "updated", + "datatype": "timestamp" }, { "name": "extra", @@ -54,11 +51,13 @@ } ], "primaryKey": [ + "authors_sbt", + "flowcell_id", "flowcell_tgz", - "reference_fasta", - "amplicon_bed_prefix", "instrument_model", - "sra_title" + "sample_rename_map", + "title", + "updated" ], "partitionMode": "date", "datePartitionOptions": { diff --git a/api/test/resources/workflows/sarscov2_illumina_full/dataset-from-inputs.edn b/api/test/resources/workflows/sarscov2_illumina_full/dataset-from-inputs.edn index 784878f78..c6d7a8da7 100644 --- a/api/test/resources/workflows/sarscov2_illumina_full/dataset-from-inputs.edn +++ b/api/test/resources/workflows/sarscov2_illumina_full/dataset-from-inputs.edn @@ -1,21 +1,9 @@ -{:flowcell_tgz "flowcell_tgz", - :reference_fasta "reference_fasta", - :amplicon_bed_prefix "amplicon_bed_prefix", - :biosample_attributes "biosample_attributes", - :instrument_model "instrument_model", - :min_genome_bases "min_genome_bases", - :max_vadr_alerts "max_vadr_alerts", - :sra_title "sra_title", - :workspace_name "$SARSCoV2-Illumina-Full", - :terra_project "$wfl-dev", - :extra {:demux_deplete.spikein_db "demux_deplete.spikein_db", - :demux_deplete.samplesheets "demux_deplete.samplesheets", - :demux_deplete.sample_rename_map 
"demux_deplete.sample_rename_map", - :demux_deplete.bwaDbs "demux_deplete.bwaDbs", - :demux_deplete.blastDbs "demux_deplete.blastDbs", - :gisaid_meta_prep.username "gisaid_meta_prep.username", - :gisaid_meta_prep.submitting_lab_addr "gisaid_meta_prep.submitting_lab_addr", - :gisaid_meta_prep.submitting_lab_name "gisaid_meta_prep.submitting_lab_name", - :package_genbank_ftp_submission.spuid_namespace "package_genbank_ftp_submission.spuid_namespace", - :package_genbank_ftp_submission.author_template_sbt "package_genbank_ftp_submission.author_template_sbt", - :package_genbank_ftp_submission.account_name "package_genbank_ftp_submission.account_name"}} +{:authors_sbt "package_genbank_ftp_submission.author_template_sbt" + :biosample_map "biosample_attributes" + :flowcell_id "$Test_Flowcell" + :flowcell_tgz "flowcell_tgz" + :instrument_model "instrument_model" + :sample_rename_map "demux_deplete.sample_rename_map" + :samplesheets "demux_deplete.samplesheets" + :title "sra_title" + :updated "$2021-03-30 20:46:39+00"} diff --git a/api/test/resources/workflows/sarscov2_illumina_full/entity-from-dataset.edn b/api/test/resources/workflows/sarscov2_illumina_full/entity-from-dataset.edn new file mode 100644 index 000000000..38d9c2b06 --- /dev/null +++ b/api/test/resources/workflows/sarscov2_illumina_full/entity-from-dataset.edn @@ -0,0 +1,8 @@ +{:authors_sbt "authors_sbt" + :biosample_map "biosample_map" + :flowcell_id "flowcell_id" + :flowcell_tgz "flowcell_tgz" + :instrument_model "instrument_model" + :sample_rename_map "sample_rename_map" + :samplesheets "samplesheets" + :title "title"} diff --git a/api/test/resources/workflows/sarscov2_illumina_full/flowcell-ingest-request.edn b/api/test/resources/workflows/sarscov2_illumina_full/flowcell-ingest-request.edn new file mode 100644 index 000000000..73af2073a --- /dev/null +++ b/api/test/resources/workflows/sarscov2_illumina_full/flowcell-ingest-request.edn @@ -0,0 +1,11 @@ +{:authors_sbt "96bd0bda-b7a6-4680-9a2a-caa2bb52e914" + :biosample_map ["0a6c073a-f73c-4f1f-9b0c-f80a105a5a54"] + :flowcell_id "Test_Flowcell" + :flowcell_tgz "ae888931-7e79-488b-b5b3-a12bb81fbd65" + :instrument_model "Illumina NovaSeq 6000" + :sample_rename_map "4c9d1a3f-739d-4524-9f58-401c6210ad1a" + :samplesheets + ["0b3ea1d2-1d51-43cd-8de8-25d4f5482884" + "0b3ea1d2-1d51-43cd-8de8-25d4f5482884"] + :title "Metagenomic RNA-Seq of SARS-CoV-2 from patients" + :updated "2021-03-30 20:46:39+00"} diff --git a/api/test/resources/workflows/sarscov2_illumina_full/inputs-ingest-request.edn b/api/test/resources/workflows/sarscov2_illumina_full/inputs-ingest-request.edn deleted file mode 100644 index 186ea2430..000000000 --- a/api/test/resources/workflows/sarscov2_illumina_full/inputs-ingest-request.edn +++ /dev/null @@ -1,9 +0,0 @@ -{:instrument_model "Illumina NovaSeq 6000" - :workspace_name "SARSCoV2-Illumina-Full" - :terra_project "wfl-dev" - :flowcell_tgz "093314e1-688e-49cb-90af-b392d104a57f" - :sra_title "Metagenomic RNA-Seq of SARS-CoV-2 from patients" - :biosample_attributes ["d1830e0c-a1dc-448b-816c-b4f656c21876"] - :amplicon_bed_prefix "gs://pathogen-public-dbs/v1/amplicon_primers-" - :reference_fasta "a2599579-aa20-4f9b-a0a0-99b5043ec0b4" - :extra 
"{\"demux_deplete.spikein_db\":\"5613ae0f-7df9-460b-8cab-58b7f97f924f\",\"demux_deplete.samplesheets\":[\"7dbe165d-aef0-4a5c-bdff-d717e7a3f084\",\"7dbe165d-aef0-4a5c-bdff-d717e7a3f084\"],\"demux_deplete.sample_rename_map\":\"93b5ccd4-cf50-4dc4-9c9f-91a45afbb588\",\"demux_deplete.bwaDbs\":[\"85c66634-6c80-4133-92b1-22b955f36e42\"],\"demux_deplete.blastDbs\":[\"84d52a91-dfc6-421c-ae6f-a68118fa901d\",\"6aacab93-027a-4513-a609-c25143faef04\"],\"gisaid_meta_prep.username\":\"dpark\",\"gisaid_meta_prep.submitting_lab_addr\":\"75 Ames St, Cambridge, MA 02142, USA\",\"package_genbank_ftp_submission.spuid_namespace\":\"Broad_GCID\",\"gisaid_meta_prep.submitting_lab_name\":\"Infectious Disease Program, Broad Institute of Harvard and MIT\",\"package_genbank_ftp_submission.author_template_sbt\":\"f96db65e-668a-41d1-95cf-4e3457742500\",\"package_genbank_ftp_submission.account_name\":\"broad_gcid-srv\"}"} diff --git a/api/test/wfl/integration/datarepo_test.clj b/api/test/wfl/integration/datarepo_test.clj index abdb052ba..047f56037 100644 --- a/api/test/wfl/integration/datarepo_test.clj +++ b/api/test/wfl/integration/datarepo_test.clj @@ -3,6 +3,7 @@ [clojure.test :refer [deftest is testing]] [wfl.environment :as env] [wfl.service.datarepo :as datarepo] + [wfl.service.firecloud :as firecloud] [wfl.service.google.storage :as gcs] [wfl.service.google.bigquery :as bigquery] [wfl.tools.datasets :as datasets] @@ -10,7 +11,7 @@ [wfl.tools.snapshots :as snapshots] [wfl.tools.resources :as resources] [wfl.tools.workflows :as workflows] - [wfl.util :as util]) + [wfl.util :as util :refer [>>>]]) (:import [java.util UUID])) (deftest test-create-dataset @@ -77,20 +78,86 @@ (is (= 1 row_count)) (is (= 0 bad_row_count)))))))) -(def ^:private testing-dataset "3b41c460-d994-47a4-9bf7-c59861e858a6") +(def ^:private testing-dataset "ff6e2b40-6497-4340-8947-2f52a658f561") ;; Get row-ids from BigQuery and use them to create a snapshot. (deftest test-create-snapshot (let [tdr-profile (env/getenv "WFL_TDR_DEFAULT_PROFILE") - {:keys [dataProject] :as dataset} (datarepo/dataset testing-dataset) - table "flowcell" - start-datetime "2021-03-17" - end-datetime "2021-03-19" - row-ids (->> (datarepo/make-snapshot-query dataset table start-datetime end-datetime) - (bigquery/query-sync dataProject) - :rows flatten)] - (testing "creating snapshot" - (fixtures/with-temporary-snapshot - (snapshots/unique-snapshot-request tdr-profile dataset table row-ids) - #(let [snapshot (datarepo/snapshot %)] - (is (= % (:id snapshot)))))))) + dataset (datarepo/dataset testing-dataset) + table "flowcell" + row-ids (-> (datarepo/query-table-between + dataset + table + ["2021-03-30" "2021-03-31"] + [:datarepo_row_id]) + :rows + flatten)] + (fixtures/with-temporary-snapshot + (snapshots/unique-snapshot-request tdr-profile dataset table row-ids) + #(let [snapshot (datarepo/snapshot %)] + (is (= % (:id snapshot))))))) + +(deftest test-flattened-query-result + (let [samplesheets (-> (datarepo/dataset testing-dataset) + (datarepo/query-table "flowcell" [:samplesheets]) + :rows + (->> (mapcat first)))] + (is (every? string? samplesheets) "Nested arrays were not normalized."))) + +(defn ^:private table->map-view + "Create a hash-map 'view' of (adapter for) the BigQuery `table`." 
+ [{:keys [schema rows] :as _table}] + (let [make-entry (fn [idx field] [(-> field :name keyword) idx]) + key->index (into {} (map-indexed make-entry (:fields schema)))] + (map (util/curry #(when-let [idx (key->index %2)] (%1 idx))) rows))) + +(defn ^:private maps->table + "Transform a list of maps into a table with `columns`." + [maps columns] + (let [table {:schema {:fields (mapv #(-> {:name (name %)}) columns)}} + f #(reduce (fn [row attr] (conj row (% attr))) [] columns)] + (assoc table :rows (map f maps)))) + +(defn ^:private make-entity-import-request-tsv + [from-dataset columns maps] + (-> (map #(datasets/rename-gather % from-dataset) maps) + (maps->table columns) + bigquery/dump-table->tsv)) + +(defn import-table + "Import the BigQuery `table` into the `entity` in the Terra `workspace`, + using `from-snapshot` to map column names in `table` to the `columns` in the + workspace entity. Return the names of the entities imported into `workspace`" + [table workspace [primary-key & _ :as columns] from-snapshot] + (let [maps (table->map-view table)] + (->> (make-entity-import-request-tsv from-snapshot columns maps) + .getBytes + (firecloud/import-entities workspace)) + (map #(% primary-key) maps))) + +(def ^:private snapshot-id "7cb392d8-949b-419d-b40b-d039617d2fc7") + +(def ^:private entity-columns + "Return the columns in the `entity-type`." + (>>> (juxt :idName :attributeNames) + #(apply cons %) + #(mapv keyword %))) + +(deftest test-import-snapshot + (let [dataset-table "flowcell" + entity "flowcell" + from-dataset (resources/read-resource "entity-from-dataset.edn") + columns (-> (firecloud/list-entity-types "wfl-dev/SARSCoV2-Illumina-Full") + :flowcell + entity-columns)] + (fixtures/with-temporary-workspace + (fn [workspace] + (let [entities (-> (datarepo/snapshot snapshot-id) + (datarepo/query-table dataset-table) + (import-table workspace columns from-dataset)) + names (->> #(firecloud/list-entities workspace entity) + (comp not-empty) + util/poll + (map :name) + set)] + (is (every? names entities))))))) diff --git a/api/test/wfl/system/automation_test.clj b/api/test/wfl/system/automation_test.clj index 795c23d6b..0a52c580b 100644 --- a/api/test/wfl/system/automation_test.clj +++ b/api/test/wfl/system/automation_test.clj @@ -22,8 +22,7 @@ (deftest test-automate-sarscov2-illumina-full (let [tdr-profile (env/getenv "WFL_TDR_DEFAULT_PROFILE")] (fixtures/with-fixtures - [(fixtures/with-temporary-cloud-storage-folder - "broad-gotc-dev-wfl-ptc-test-inputs") + [(fixtures/with-temporary-cloud-storage-folder fixtures/gcs-test-bucket) (fixtures/with-temporary-dataset (datasets/unique-dataset-request tdr-profile @@ -35,20 +34,21 @@ (fn [[temp source sink]] ;; TODO: create + start the workload ;; upload a sample - (let [inputs (resources/read-resource "sarscov2_illumina_full/inputs.edn") + (let [;; This defines how we'll convert the inputs of the pipeline into + ;; a form that can be ingested as a new row in the dataset. + ;; I think a user would specify something like this in the initial + ;; workload request, one mapping for dataset to workspace entity + ;; names and one for outputs to dataset. 
+ from-inputs (resources/read-resource "sarscov2_illumina_full/dataset-from-inputs.edn") + inputs (-> (resources/read-resource "sarscov2_illumina_full/inputs.edn") + (select-keys (map keyword (vals from-inputs)))) inputs-type (-> "sarscov2_illumina_full.edn" resources/read-resource :inputs workflows/make-object-type) - table-name "sarscov2_illumina_full_inputs" + table-name "flowcell" unique-prefix (UUID/randomUUID) - table-url (str temp "inputs.json") - ;; This defines how we'll convert the inputs of the pipeline into - ;; a form that can be ingested as a new row in the dataset. - ;; I think a user would specify something like this in the initial - ;; workload request, one mapping for dataset to inputs and one for - ;; outputs to dataset. - from-inputs (resources/read-resource "sarscov2_illumina_full/dataset-from-inputs.edn")] + table-url (str temp "inputs.json")] (-> (->> (workflows/get-files inputs-type inputs) (datasets/ingest-files tdr-profile source unique-prefix)) (replace-urls-with-file-ids inputs-type inputs) diff --git a/api/test/wfl/tools/datasets.clj b/api/test/wfl/tools/datasets.clj index 7b55371a8..7e64f1827 100644 --- a/api/test/wfl/tools/datasets.clj +++ b/api/test/wfl/tools/datasets.clj @@ -1,9 +1,9 @@ (ns wfl.tools.datasets - (:require [clojure.string :as str] - [clojure.data.json :as json] - [wfl.service.datarepo :as datarepo] - [wfl.service.google.storage :as storage]) - (:import (java.util UUID))) + (:require [clojure.string :as str] + [clojure.data.json :as json] + [wfl.service.datarepo :as datarepo] + [wfl.service.google.storage :as storage] + [wfl.util :as util])) (defn unique-dataset-request "Create a dataset request for a uniquely-named dataset defined by the json @@ -12,7 +12,7 @@ (-> (str "test/resources/datasets/" dataset-basename) slurp json/read-str - (update "name" #(str % (-> (UUID/randomUUID) (str/replace "-" "")))) + (update "name" util/randomize) (update "defaultProfileId" (constantly tdr-profile)))) ;; TDR limits size of bulk ingest request to 1000 files. Muscles says @@ -42,7 +42,7 @@ (letfn [(literal? [x] (str/starts-with? x "$")) (go! [v] (cond (literal? v) (subs v 1 (count v)) - (string? v) (get values (keyword v)) + (string? v) (values (keyword v)) (map? v) (json/write-str (rename-gather values v) :escape-slash false) (coll? v) (keep go! v) diff --git a/api/test/wfl/tools/fixtures.clj b/api/test/wfl/tools/fixtures.clj index 38d13d83a..879884e7d 100644 --- a/api/test/wfl/tools/fixtures.clj +++ b/api/test/wfl/tools/fixtures.clj @@ -7,7 +7,8 @@ [wfl.tools.liquibase :as liquibase] [wfl.jdbc :as jdbc] [wfl.util :as util] - [wfl.environment :as env]) + [wfl.environment :as env] + [wfl.service.firecloud :as firecloud]) (:import [java.util UUID] (java.nio.file Files) (org.apache.commons.io FileUtils) @@ -197,6 +198,15 @@ datarepo/delete-snapshot f)) +(defn with-temporary-workspace + "Create and use a temporary Terra Workspace." + [f] + (util/bracket + #(doto (util/randomize "wfl-dev/test-workspace") + (firecloud/create-workspace "workflow-launcher-dev")) + firecloud/delete-workspace + f)) + (defn with-temporary-environment "Temporarily override the environment with the key-value mapping in `env`. The original environment will be restored after `f` returns. 
No guarantees diff --git a/api/test/wfl/tools/snapshots.clj b/api/test/wfl/tools/snapshots.clj index 3d7d4325e..a35a38992 100644 --- a/api/test/wfl/tools/snapshots.clj +++ b/api/test/wfl/tools/snapshots.clj @@ -1,29 +1,16 @@ (ns wfl.tools.snapshots - (:require [clojure.string :as str] - [clojure.test :refer :all] - [wfl.service.google.bigquery :as bigquery] + (:require [clojure.test :refer :all] [wfl.service.datarepo :as datarepo] - [wfl.util :as util]) - (:import (java.util UUID))) - -;; See https://broadworkbench.atlassian.net/browse/DR-1696 -(defn ^:private legalize-tdr-columns - "Legalize TDR columns by stripping out columns that are Array[File] types, as - TDR does not support them yet." - [cols] - (letfn [(is-fileref-array? [{:keys [datatype array_of]}] - (and (= datatype "fileref") array_of))] - (remove is-fileref-array? cols))) + [wfl.util :as util])) (defn unique-snapshot-request "Wrap `table` from `dataset` in a snapshot with a unique name for `tdr-profile`." [tdr-profile dataset table row-ids] (let [columns (-> (datarepo/all-columns dataset table) - legalize-tdr-columns (->> (map :name) set) (conj "datarepo_row_id"))] (-> (datarepo/make-snapshot-request dataset columns table row-ids) - (update :name #(str % (-> (UUID/randomUUID) (str/replace "-" "")))) + (update :name util/randomize) (update :profileId (constantly tdr-profile))))) ;; Partition row IDs into batches of 500 to keep TDR happy. @@ -35,16 +22,10 @@ Note the dataset row query time range is set to (yesterday, today] for testing purposes for now." [tdr-profile dataset table] - (let [{:keys [dataProject]} dataset - table table - today (util/today) + (let [today (util/today) yesterday (util/days-from-today -1) - row-ids (->> (datarepo/make-snapshot-query dataset table yesterday today) - (bigquery/query-sync dataProject) - flatten) unique-request-batch (partial unique-snapshot-request tdr-profile dataset table)] - (->> (datarepo/make-snapshot-query dataset table yesterday today) - (bigquery/query-sync dataProject) + (->> (datarepo/query-table-between dataset table [yesterday today] [:datarepo_row_id]) flatten (partition-all 500) (map unique-request-batch) diff --git a/api/test/wfl/unit/google/bigquery_test.clj b/api/test/wfl/unit/google/bigquery_test.clj index fd3fa2b3f..3a0be47dc 100644 --- a/api/test/wfl/unit/google/bigquery_test.clj +++ b/api/test/wfl/unit/google/bigquery_test.clj @@ -3,31 +3,24 @@ [clojure.data.csv :as csv] [wfl.service.google.bigquery :as bigquery])) +;; mock output from bigquery/query-sync (def ^:private dr-view-content - "Test fixture to simulate data structure from bigquery/query-sync." 
- {:kind "bigquery#queryResponse" - :schema {:fields [{:mode "NULLABLE" name "datarepo_row_id" :type "STRING"} - {:mode "NULLABLE" :name "vcf" :type "STRING"} - {:mode "NULLABLE" :name "id" :type "STRING"} - {:mode "NULLABLE" :name "vcf_index" :type "STRING"}]} - :jobReference {:projectId "broad-jade-dev-data" :jobId "job_Zd6Ld4uCl8kmuFkiGKsPdk5OnBNP" :location "US"} - :totalRows "2" - :rows ['("8d529c08-bc21-4ea0-9254-d99b9c12dfd2" - "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_f2a7d885-4fd3-4faf-bd16-06219a8eef99" - "wfl-test-a830fe00-7ef2-430a-9d5e-fa0c18dc99e1/" - "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_2b67ed53-ccac-49c6-8ad6-8952a1dfaf98") - '("8d529c08-bc21-4ea0-9254-d99b9c12dfd2" - "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_f2a7d885-4fd3-4faf-bd16-06219a8eef99" - "wfl-test-a830fe00-7ef2-430a-9d5e-fa0c18dc99e1/" - "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_2b67ed53-ccac-49c6-8ad6-8952a1dfaf98")] - :totalBytesProcessed "221025" - :jobComplete true - :cacheHit false}) + {:schema {:fields [{:name "test-name_id" :type "STRING" :mode "NULLABLE"} + {:name "vcf" :type "STRING" :mode "NULLABLE"} + {:name "id" :type "STRING" :mode "NULLABLE"} + {:name "vcf_index" :type "STRING" :mode "NULLABLE"}]} + :rows [["8d529c08-bc21-4ea0-9254-d99b9c12dfd2" + "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_f2a7d885-4fd3-4faf-bd16-06219a8eef99" + "wfl-test-a830fe00-7ef2-430a-9d5e-fa0c18dc99e1/" + "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_2b67ed53-ccac-49c6-8ad6-8952a1dfaf98"] + ["8d529c08-bc21-4ea0-9254-d99b9c12dfd2" + "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_f2a7d885-4fd3-4faf-bd16-06219a8eef99" + "wfl-test-a830fe00-7ef2-430a-9d5e-fa0c18dc99e1/" + "drs://jade.datarepo-dev.broadinstitute.org/v1_f1c765c6-5446-4aef-bdbe-c741ff09c27c_2b67ed53-ccac-49c6-8ad6-8952a1dfaf98"]]}) (deftest test-dump-table->tsv - (testing "Dumping from BigQuery table response to TSV works" - (let [terra-table-name "test-name" - contents (-> (bigquery/dump-table->tsv dr-view-content "test-name") - (csv/read-csv :separator \tab))] - (is (= (format "entity:%s_id" terra-table-name) - ((comp first first) contents)) "The result TSV header is not properly formatted!")))) + (let [contents (-> (bigquery/dump-table->tsv dr-view-content) + (csv/read-csv :separator \tab))] + (is (= 3 (count contents)) "expect 3 rows (headers + 2 of data)") + (is (every? #(= 4 %) (map count contents)) "rows have same number of columns as fields") + (is (= "entity:test-name_id" (ffirst contents)) "The result TSV header is not properly formatted!")))