Skip to content

Commit

Permalink
apacheGH-36352: [Python] Add project_id to GcsFileSystem options (apache#36376)
Browse files Browse the repository at this point in the history

### Rationale for this change
Some of our Python CI tests for GCS are failing due to the new project_id option added for GcsFileSystem here: apache#36228

### What changes are included in this PR?

Added option

### Are these changes tested?

Will be tested on CI.

### Are there any user-facing changes?

Yes, there is a new project_id option when defining a GcsFileSystem.
* Closes: apache#36352

Lead-authored-by: Raúl Cumplido <raulcumplido@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Raúl Cumplido <raulcumplido@gmail.com>
  • Loading branch information
raulcd and jorisvandenbossche authored Jul 4, 2023
1 parent 7ebc88c commit cd1ed18
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 5 deletions.
24 changes: 22 additions & 2 deletions python/pyarrow/_gcsfs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ cdef class GcsFileSystem(FileSystem):
retry_time_limit : timedelta, default None
Set the maximum amount of time the GCS client will attempt to retry
transient errors. Subsecond granularity is ignored.
project_id : str, default None
The GCP project identifier to use for creating buckets.
If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
variable. Most I/O operations do not need a project id, only applications
that create new buckets need a project id.
"""

cdef:
Expand All @@ -86,7 +91,8 @@ cdef class GcsFileSystem(FileSystem):
scheme=None,
endpoint_override=None,
default_metadata=None,
retry_time_limit=None):
retry_time_limit=None,
project_id=None):
cdef:
CGcsOptions options
shared_ptr[CGcsFileSystem] wrapped
Expand Down Expand Up @@ -136,6 +142,8 @@ cdef class GcsFileSystem(FileSystem):
if retry_time_limit is not None:
time_limit_seconds = retry_time_limit.total_seconds()
options.retry_limit_seconds = time_limit_seconds
if project_id is not None:
options.project_id = <c_string>tobytes(project_id)

with nogil:
wrapped = GetResultValue(CGcsFileSystem.Make(options))
Expand Down Expand Up @@ -165,6 +173,9 @@ cdef class GcsFileSystem(FileSystem):
if opts.retry_limit_seconds.has_value():
retry_time_limit = timedelta(
seconds=opts.retry_limit_seconds.value())
project_id = None
if opts.project_id.has_value():
project_id = frombytes(opts.project_id.value())
return (
GcsFileSystem._reconstruct, (dict(
access_token=frombytes(opts.credentials.access_token()),
Expand All @@ -176,7 +187,8 @@ cdef class GcsFileSystem(FileSystem):
default_bucket_location=frombytes(
opts.default_bucket_location),
default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
retry_time_limit=retry_time_limit
retry_time_limit=retry_time_limit,
project_id=project_id
),))

@property
Expand All @@ -185,3 +197,11 @@ cdef class GcsFileSystem(FileSystem):
The GCP location this filesystem will write to.
"""
return frombytes(self.gcsfs.options().default_bucket_location)

@property
def project_id(self):
"""
The GCP project id this filesystem will use.
"""
if self.gcsfs.options().project_id.has_value():
return frombytes(self.gcsfs.options().project_id.value())
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_fs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
c_string endpoint_override
c_string scheme
c_string default_bucket_location
optional[c_string] project_id
optional[double] retry_limit_seconds
shared_ptr[const CKeyValueMetadata] default_metadata
c_bool Equals(const CS3Options& other)
Expand Down
9 changes: 6 additions & 3 deletions python/pyarrow/tests/test_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ def gcsfs(request, gcs_server):
scheme='http',
# Mock endpoint doesn't check credentials.
anonymous=True,
retry_time_limit=timedelta(seconds=45)
retry_time_limit=timedelta(seconds=45),
project_id='test-project-id'
)
try:
fs.create_dir(bucket)
Expand Down Expand Up @@ -1064,9 +1065,11 @@ def test_gcs_options():
target_service_account='service_account@apache',
credential_token_expiration=dt,
default_bucket_location='us-west2',
scheme='https', endpoint_override='localhost:8999')
scheme='https', endpoint_override='localhost:8999',
project_id='test-project-id')
assert isinstance(fs, GcsFileSystem)
assert fs.default_bucket_location == 'us-west2'
assert fs.project_id == 'test-project-id'
assert pickle.loads(pickle.dumps(fs)) == fs

fs = GcsFileSystem()
Expand Down Expand Up @@ -1476,7 +1479,7 @@ def test_filesystem_from_uri_gcs(gcs_server):

uri = ("gs://anonymous@" +
f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}&" +
"retry_limit_seconds=5")
"retry_limit_seconds=5&project_id=test-project-id")

fs, path = FileSystem.from_uri(uri)
assert isinstance(fs, GcsFileSystem)
Expand Down

0 comments on commit cd1ed18

Please sign in to comment.