
Commit 6dc30d2

bug-1886021: Implement GCS storage classes for GCP (#6572)
1 parent 0e44482 commit 6dc30d2

24 files changed: +1513 -263 lines

bin/gcs_cli.py (+49)
@@ -9,6 +9,7 @@
 # Usage: ./bin/gcs_cli.py CMD
 
 import os
+from pathlib import Path, PurePosixPath
 
 import click
 
@@ -119,6 +120,54 @@ def list_objects(bucket_name, details):
         click.echo("No objects in bucket.")
 
 
+@gcs_group.command("upload")
+@click.argument("source")
+@click.argument("destination")
+def upload(source, destination):
+    """Upload files to a bucket
+
+    SOURCE is a path to a file or directory of files. will recurse on directory trees
+
+    DESTINATION is a path to a file or directory in the bucket. If SOURCE is a
+    directory or DESTINATION ends with "/", then DESTINATION is treated as a directory.
+    """
+
+    client = get_client()
+
+    # remove protocol from destination if present
+    destination = destination.split("://", 1)[-1]
+    bucket_name, _, prefix = destination.partition("/")
+    prefix_path = PurePosixPath(prefix)
+
+    try:
+        bucket = client.get_bucket(bucket_name)
+    except NotFound as e:
+        raise click.ClickException(f"GCS bucket {bucket_name!r} does not exist.") from e
+
+    source_path = Path(source)
+    if not source_path.exists():
+        raise click.ClickException(f"local path {source!r} does not exist.")
+    source_is_dir = source_path.is_dir()
+    if source_is_dir:
+        sources = [p for p in source_path.rglob("*") if not p.is_dir()]
+    else:
+        sources = [source_path]
+    if not sources:
+        raise click.ClickException(f"No files in directory {source!r}.")
+    for path in sources:
+        if source_is_dir:
+            # source is a directory so treat destination as a directory
+            key = str(prefix_path / path.relative_to(source_path))
+        elif prefix == "" or prefix.endswith("/"):
+            # source is a file but destination is a directory, preserve file name
+            key = str(prefix_path / path.name)
+        else:
+            key = prefix
+        blob = bucket.blob(key)
+        blob.upload_from_filename(path)
+        click.echo(f"Uploaded gs://{bucket_name}/{key}")
+
+
 def main(argv=None):
     argv = argv or []
     gcs_group(argv)
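
The new upload command maps local paths to object keys in three cases: a directory source mirrors its tree under the destination prefix, a file source keeps its name when the prefix is empty or ends with "/", and otherwise the prefix names the object exactly. A minimal standalone sketch of that mapping, with hypothetical example paths that are not part of the commit:

    from pathlib import Path, PurePosixPath

    def object_key(path, source_root, source_is_dir, prefix):
        """Mirror of the upload command's key computation (sketch only)."""
        prefix_path = PurePosixPath(prefix)
        if source_is_dir:
            # mirror the local directory tree under the prefix
            return str(prefix_path / path.relative_to(source_root))
        if prefix == "" or prefix.endswith("/"):
            # destination is a directory, preserve the file name
            return str(prefix_path / path.name)
        # destination names a single object exactly
        return prefix

    # uploading directory "crashdata/" to "gs://dev-bucket/v1" yields keys like:
    print(object_key(Path("crashdata/raw/abc123"), Path("crashdata"), True, "v1"))
    # -> v1/raw/abc123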

bin/process_crashes.sh (+12 -5)
@@ -4,8 +4,8 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-# Pulls down crash data for specified crash ids, syncs to the S3 bucket, and
-# sends the crash ids to the Pub/Sub queue.
+# Pulls down crash data for specified crash ids, syncs to the cloud storage
+# bucket, and sends the crash ids to the queue.
 #
 # Usage: ./bin/process_crashes.sh
 #
@@ -47,9 +47,16 @@ mkdir "${DATADIR}" || echo "${DATADIR} already exists."
 ./socorro-cmd fetch_crash_data "${DATADIR}" $@
 
 # Make the bucket and sync contents
-./bin/socorro_aws_s3.sh mb s3://dev-bucket/
-./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
-./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
+# ^^ returns CLOUD_PROVIDER value as uppercase
+if [[ "${CLOUD_PROVIDER^^}" == "GCP" ]]; then
+    ./socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}"
+    ./socorro-cmd gcs upload "${DATADIR}" "${CRASHSTORAGE_GCS_BUCKET}"
+    ./socorro-cmd gcs list_objects "${CRASHSTORAGE_GCS_BUCKET}"
+else
+    ./bin/socorro_aws_s3.sh mb "s3://${CRASHSTORAGE_S3_BUCKET}/"
+    ./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
+    ./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
+fi
 
 # Add crash ids to queue
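
For reference, `${CLOUD_PROVIDER^^}` is bash parameter expansion that uppercases the variable's value, so the branch matches "gcp", "GCP", or any mixed casing. The same dispatch expressed in Python, as an illustrative sketch (the command strings are placeholders):

    import os

    # uppercase CLOUD_PROVIDER and compare, like the shell script's ${CLOUD_PROVIDER^^}
    if os.environ.get("CLOUD_PROVIDER", "").upper() == "GCP":
        print("sync with: socorro-cmd gcs create/upload/list_objects")
    else:
        print("sync with: bin/socorro_aws_s3.sh mb/cp/ls")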

socorro/external/boto/connection_context.py (+7 -2)
@@ -204,17 +204,22 @@ def load_file(self, bucket, path):
                 f"(bucket={bucket!r} key={path}) not found, no value returned"
             ) from exc
 
-    def list_objects_paginator(self, bucket, prefix):
+    def list_objects_paginator(self, bucket, prefix, page_size=None):
         """Returns S3 client paginator of objects with key prefix in bucket
 
         :arg bucket: the name of the bucket
         :arg prefix: the key prefix
+        :arg page_size: the size of pages to request
 
         :returns: S3 paginator
 
         """
         paginator = self.client.get_paginator("list_objects_v2")
-        page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)
+        page_iterator = paginator.paginate(
+            Bucket=bucket,
+            Prefix=prefix,
+            PaginationConfig={} if page_size is None else {"PageSize": page_size},
+        )
         return page_iterator
 
     def head_object(self, bucket, key):
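
boto3's list_objects_v2 paginator accepts an optional PaginationConfig mapping, so passing an empty dict when page_size is None leaves the service defaults in place. A usage sketch of the underlying boto3 call (bucket name and prefix are made up):

    import boto3

    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    # request pages of at most 500 keys
    for page in paginator.paginate(
        Bucket="dev-bucket",
        Prefix="v1/raw_crash/",
        PaginationConfig={"PageSize": 500},
    ):
        for item in page.get("Contents", []):
            print(item["Key"])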

socorro/external/boto/crashstorage.py (+12 -45)
@@ -2,7 +2,6 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-import datetime
 import json
 import logging
 
@@ -12,6 +11,10 @@
     CrashStorageBase,
     CrashIDNotFound,
     MemoryDumpsMapping,
+    get_datestamp,
+    dict_to_str,
+    list_to_str,
+    str_to_list,
 )
 from socorro.external.boto.connection_context import S3Connection
 from socorro.lib.libjsonschema import JsonSchemaReducer
@@ -21,7 +24,6 @@
     SocorroDataReducer,
     transform_schema,
 )
-from socorro.lib.libooid import date_from_ooid
 from socorro.schemas import TELEMETRY_SOCORRO_CRASH_SCHEMA
 
 
@@ -32,25 +34,6 @@ def wait_time_generator():
     yield from [1, 1, 1, 1, 1]
 
 
-class CrashIDMissingDatestamp(Exception):
-    """Indicates the crash id is invalid and missing a datestamp."""
-
-
-def get_datestamp(crashid):
-    """Parses out datestamp from a crashid.
-
-    :returns: datetime
-
-    :raises CrashIDMissingDatestamp: if the crash id has no datestamp at the end
-
-    """
-    datestamp = date_from_ooid(crashid)
-    if datestamp is None:
-        # We should never hit this situation unless the crashid is not valid
-        raise CrashIDMissingDatestamp(f"{crashid} is missing datestamp")
-    return datestamp
-
-
 def build_keys(name_of_thing, crashid):
     """Builds a list of s3 pseudo-filenames
 
@@ -81,25 +64,6 @@
     return [f"v1/{name_of_thing}/{crashid}"]
 
 
-class JSONISOEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, datetime.date):
-            return obj.isoformat()
-        raise NotImplementedError(f"Don't know about {obj!r}")
-
-
-def dict_to_str(a_mapping):
-    return json.dumps(a_mapping, cls=JSONISOEncoder)
-
-
-def list_to_str(a_list):
-    return json.dumps(list(a_list))
-
-
-def str_to_list(a_string):
-    return json.loads(a_string)
-
-
 class BotoS3CrashStorage(CrashStorageBase):
     """Saves and loads crash data to S3"""
 
@@ -195,15 +159,18 @@ def save_processed_crash(self, raw_crash, processed_crash):
         path = build_keys("processed_crash", crash_id)[0]
         self.save_file(path, data)
 
-    def list_objects_paginator(self, prefix):
-        """Return generator of objects in the bucket that have a specified key prefix
+    def list_objects_paginator(self, prefix, page_size=None):
+        """Yield pages of object keys in the bucket that have a specified key prefix
 
         :arg prefix: the prefix to look at
+        :arg page_size: the number of results to return per page
 
-        :returns: generator of keys
-
+        :returns: generator of pages (lists) of object keys
         """
-        return self.connection.list(bucket=self.bucket, prefix=prefix)
+        for page in self.connection.list_objects_paginator(
+            bucket=self.bucket, prefix=prefix, page_size=page_size
+        ):
+            yield [item["Key"] for item in page.get("Contents", [])]
 
     def exists_object(self, key):
        """Returns whether the object exists in the bucket

socorro/external/crashstorage_base.py (+42)
@@ -5,9 +5,13 @@
 """Base classes for crashstorage system."""
 
 from contextlib import suppress
+import datetime
+import json
 import logging
 import os
 
+from socorro.lib.libooid import date_from_ooid
+
 
 class MemoryDumpsMapping(dict):
     """there has been a bifurcation in the crash storage data throughout the
@@ -262,3 +266,41 @@ def remove(self, crash_id):
 
         with suppress(KeyError):
             del self._processed_crash_data[crash_id]
+
+
+class CrashIDMissingDatestamp(Exception):
+    """Indicates the crash id is invalid and missing a datestamp."""
+
+
+def get_datestamp(crashid):
+    """Parses out datestamp from a crashid.
+
+    :returns: datetime
+
+    :raises CrashIDMissingDatestamp: if the crash id has no datestamp at the end
+
+    """
+    datestamp = date_from_ooid(crashid)
+    if datestamp is None:
+        # We should never hit this situation unless the crashid is not valid
+        raise CrashIDMissingDatestamp(f"{crashid} is missing datestamp")
+    return datestamp
+
+
+class JSONISOEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, datetime.date):
+            return obj.isoformat()
+        raise NotImplementedError(f"Don't know about {obj!r}")
+
+
+def dict_to_str(a_mapping):
+    return json.dumps(a_mapping, cls=JSONISOEncoder)
+
+
+def list_to_str(a_list):
+    return json.dumps(list(a_list))
+
+
+def str_to_list(a_string):
+    return json.loads(a_string)
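
These helpers moved verbatim out of socorro/external/boto/crashstorage.py so storage backends for both providers can share them. A quick illustration of their behavior, assuming the package layout after this commit (the values and crash id are illustrative):

    import datetime

    from socorro.external.crashstorage_base import (
        dict_to_str,
        get_datestamp,
        list_to_str,
        str_to_list,
    )

    # JSONISOEncoder lets dict_to_str serialize dates as ISO-8601 strings
    print(dict_to_str({"submitted": datetime.date(2024, 3, 18)}))
    # -> {"submitted": "2024-03-18"}

    print(str_to_list(list_to_str(["a", "b"])))  # -> ['a', 'b']

    # parses the date encoded in the crash id's trailing characters
    print(get_datestamp("0bba929f-8721-460c-dead-a43c20071025"))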

socorro/external/gcs/__init__.py (+3)
@@ -0,0 +1,3 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
