Skip to content

Commit 8fb4d88

Browse files
Merge branch 'main' into daryna/add-stream-slicer-to-http-component-resolver
2 parents ec9e480 + 57e1b52 commit 8fb4d88

21 files changed

+1162
-442
lines changed

airbyte_cdk/sources/declarative/declarative_component_schema.yaml

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1234,6 +1234,8 @@ definitions:
12341234
- "$ref": "#/definitions/CustomTransformation"
12351235
- "$ref": "#/definitions/RemoveFields"
12361236
- "$ref": "#/definitions/KeysToLower"
1237+
- "$ref": "#/definitions/KeysToSnakeCase"
1238+
- "$ref": "#/definitions/FlattenFields"
12371239
state_migrations:
12381240
title: State Migrations
12391241
description: Array of state migrations to be applied on the input state
@@ -1766,6 +1768,18 @@ definitions:
17661768
- "$ref": "#/definitions/AsyncRetriever"
17671769
- "$ref": "#/definitions/CustomRetriever"
17681770
- "$ref": "#/definitions/SimpleRetriever"
1771+
schema_transformations:
1772+
title: Schema Transformations
1773+
description: A list of transformations to be applied to the schema.
1774+
type: array
1775+
items:
1776+
anyOf:
1777+
- "$ref": "#/definitions/AddFields"
1778+
- "$ref": "#/definitions/CustomTransformation"
1779+
- "$ref": "#/definitions/RemoveFields"
1780+
- "$ref": "#/definitions/KeysToLower"
1781+
- "$ref": "#/definitions/KeysToSnakeCase"
1782+
- "$ref": "#/definitions/FlattenFields"
17691783
schema_type_identifier:
17701784
"$ref": "#/definitions/SchemaTypeIdentifier"
17711785
$parameters:
@@ -1838,6 +1852,32 @@ definitions:
18381852
$parameters:
18391853
type: object
18401854
additionalProperties: true
1855+
KeysToSnakeCase:
1856+
title: Keys to Snake Case
1857+
description: A transformation that renames all keys to snake case.
1858+
type: object
1859+
required:
1860+
- type
1861+
properties:
1862+
type:
1863+
type: string
1864+
enum: [KeysToSnakeCase]
1865+
$parameters:
1866+
type: object
1867+
additionalProperties: true
1868+
FlattenFields:
1869+
title: Flatten Fields
1870+
description: A transformation that flattens a record to a single-level format.
1871+
type: object
1872+
required:
1873+
- type
1874+
properties:
1875+
type:
1876+
type: string
1877+
enum: [FlattenFields]
1878+
$parameters:
1879+
type: object
1880+
additionalProperties: true
18411881
IterableDecoder:
18421882
title: Iterable Decoder
18431883
description: Use this if the response consists of strings separated by new lines (`\n`). The Decoder will wrap each row into a JSON object with the `record` key.
@@ -2160,7 +2200,9 @@ definitions:
21602200
description: |-
21612201
The DeclarativeOAuth Specific optional headers to inject while exchanging the `auth_code` to `access_token` during `completeOAuthFlow` step.
21622202
examples:
2163-
- {"Authorization": "Basic {base64Encoder:{client_id}:{client_secret}}"}
2203+
- {
2204+
"Authorization": "Basic {base64Encoder:{client_id}:{client_secret}}",
2205+
}
21642206
access_token_params:
21652207
title: Access Token Query Params (Json Encoded)
21662208
type: object

airbyte_cdk/sources/declarative/models/declarative_component_schema.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,16 @@ class KeysToLower(BaseModel):
710710
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
711711

712712

713+
class KeysToSnakeCase(BaseModel):
714+
type: Literal["KeysToSnakeCase"]
715+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
716+
717+
718+
class FlattenFields(BaseModel):
719+
type: Literal["FlattenFields"]
720+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
721+
722+
713723
class IterableDecoder(BaseModel):
714724
type: Literal["IterableDecoder"]
715725

@@ -1654,7 +1664,16 @@ class Config:
16541664
title="Schema Loader",
16551665
)
16561666
transformations: Optional[
1657-
List[Union[AddFields, CustomTransformation, RemoveFields, KeysToLower]]
1667+
List[
1668+
Union[
1669+
AddFields,
1670+
CustomTransformation,
1671+
RemoveFields,
1672+
KeysToLower,
1673+
KeysToSnakeCase,
1674+
FlattenFields,
1675+
]
1676+
]
16581677
] = Field(
16591678
None,
16601679
description="A list of transformations to be applied to each output record.",
@@ -1818,6 +1837,22 @@ class DynamicSchemaLoader(BaseModel):
18181837
description="Component used to coordinate how records are extracted across stream slices and request pages.",
18191838
title="Retriever",
18201839
)
1840+
schema_transformations: Optional[
1841+
List[
1842+
Union[
1843+
AddFields,
1844+
CustomTransformation,
1845+
RemoveFields,
1846+
KeysToLower,
1847+
KeysToSnakeCase,
1848+
FlattenFields,
1849+
]
1850+
]
1851+
] = Field(
1852+
None,
1853+
description="A list of transformations to be applied to the schema.",
1854+
title="Schema Transformations",
1855+
)
18211856
schema_type_identifier: SchemaTypeIdentifier
18221857
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
18231858

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,9 @@
197197
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
198198
ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
199199
)
200+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
201+
FlattenFields as FlattenFieldsModel,
202+
)
200203
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
201204
GzipJsonDecoder as GzipJsonDecoderModel,
202205
)
@@ -236,6 +239,9 @@
236239
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
237240
KeysToLower as KeysToLowerModel,
238241
)
242+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
243+
KeysToSnakeCase as KeysToSnakeCaseModel,
244+
)
239245
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
240246
LegacySessionTokenAuthenticator as LegacySessionTokenAuthenticatorModel,
241247
)
@@ -323,6 +329,9 @@
323329
SinglePartitionRouter,
324330
SubstreamPartitionRouter,
325331
)
332+
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import (
333+
AsyncJobPartitionRouter,
334+
)
326335
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
327336
ParentStreamConfig,
328337
)
@@ -387,9 +396,15 @@
387396
RemoveFields,
388397
)
389398
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition
399+
from airbyte_cdk.sources.declarative.transformations.flatten_fields import (
400+
FlattenFields,
401+
)
390402
from airbyte_cdk.sources.declarative.transformations.keys_to_lower_transformation import (
391403
KeysToLowerTransformation,
392404
)
405+
from airbyte_cdk.sources.declarative.transformations.keys_to_snake_transformation import (
406+
KeysToSnakeCaseTransformation,
407+
)
393408
from airbyte_cdk.sources.message import (
394409
InMemoryMessageRepository,
395410
LogAppenderMessageRepositoryDecorator,
@@ -472,6 +487,8 @@ def _init_mappings(self) -> None:
472487
JsonlDecoderModel: self.create_jsonl_decoder,
473488
GzipJsonDecoderModel: self.create_gzipjson_decoder,
474489
KeysToLowerModel: self.create_keys_to_lower_transformation,
490+
KeysToSnakeCaseModel: self.create_keys_to_snake_transformation,
491+
FlattenFieldsModel: self.create_flatten_fields,
475492
IterableDecoderModel: self.create_iterable_decoder,
476493
XmlDecoderModel: self.create_xml_decoder,
477494
JsonFileSchemaLoaderModel: self.create_json_file_schema_loader,
@@ -587,6 +604,16 @@ def create_keys_to_lower_transformation(
587604
) -> KeysToLowerTransformation:
588605
return KeysToLowerTransformation()
589606

607+
def create_keys_to_snake_transformation(
608+
self, model: KeysToSnakeCaseModel, config: Config, **kwargs: Any
609+
) -> KeysToSnakeCaseTransformation:
610+
return KeysToSnakeCaseTransformation()
611+
612+
def create_flatten_fields(
613+
self, model: FlattenFieldsModel, config: Config, **kwargs: Any
614+
) -> FlattenFields:
615+
return FlattenFields()
616+
590617
@staticmethod
591618
def _json_schema_type_name_to_type(value_type: Optional[ValueType]) -> Optional[Type[Any]]:
592619
if not value_type:
@@ -1638,6 +1665,13 @@ def create_dynamic_schema_loader(
16381665
model.retriever, stream_slicer
16391666
)
16401667

1668+
schema_transformations = []
1669+
if model.schema_transformations:
1670+
for transformation_model in model.schema_transformations:
1671+
schema_transformations.append(
1672+
self._create_component_from_model(model=transformation_model, config=config)
1673+
)
1674+
16411675
retriever = self._create_component_from_model(
16421676
model=model.retriever,
16431677
config=config,
@@ -1652,6 +1686,7 @@ def create_dynamic_schema_loader(
16521686
return DynamicSchemaLoader(
16531687
retriever=retriever,
16541688
config=config,
1689+
schema_transformations=schema_transformations,
16551690
schema_type_identifier=schema_type_identifier,
16561691
parameters=model.parameters or {},
16571692
)
@@ -2228,22 +2263,28 @@ def create_async_retriever(
22282263
urls_extractor=urls_extractor,
22292264
)
22302265

2231-
return AsyncRetriever(
2266+
async_job_partition_router = AsyncJobPartitionRouter(
22322267
job_orchestrator_factory=lambda stream_slices: AsyncJobOrchestrator(
22332268
job_repository,
22342269
stream_slices,
2235-
JobTracker(
2236-
1
2237-
), # FIXME eventually make the number of concurrent jobs in the API configurable. Until then, we limit to 1
2270+
JobTracker(1),
2271+
# FIXME eventually make the number of concurrent jobs in the API configurable. Until then, we limit to 1
22382272
self._message_repository,
2239-
has_bulk_parent=False, # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
2273+
has_bulk_parent=False,
2274+
# FIXME work would need to be done here in order to detect if a stream has a parent stream that is bulk
22402275
),
2241-
record_selector=record_selector,
22422276
stream_slicer=stream_slicer,
22432277
config=config,
22442278
parameters=model.parameters or {},
22452279
)
22462280

2281+
return AsyncRetriever(
2282+
record_selector=record_selector,
2283+
stream_slicer=async_job_partition_router,
2284+
config=config,
2285+
parameters=model.parameters or {},
2286+
)
2287+
22472288
@staticmethod
22482289
def create_spec(model: SpecModel, config: Config, **kwargs: Any) -> Spec:
22492290
return Spec(

airbyte_cdk/sources/declarative/partition_routers/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,18 @@
22
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
33
#
44

5+
from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_router import AsyncJobPartitionRouter
56
from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import CartesianProductStreamSlicer
67
from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter
78
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
89
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter
910
from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
1011

11-
__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter", "PartitionRouter"]
12+
__all__ = [
13+
"AsyncJobPartitionRouter",
14+
"CartesianProductStreamSlicer",
15+
"ListPartitionRouter",
16+
"SinglePartitionRouter",
17+
"SubstreamPartitionRouter",
18+
"PartitionRouter"
19+
]
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
from dataclasses import InitVar, dataclass, field
4+
from typing import Any, Callable, Iterable, Mapping, Optional
5+
6+
from airbyte_cdk.models import FailureType
7+
from airbyte_cdk.sources.declarative.async_job.job_orchestrator import (
8+
AsyncJobOrchestrator,
9+
AsyncPartition,
10+
)
11+
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import (
12+
SinglePartitionRouter,
13+
)
14+
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
15+
from airbyte_cdk.sources.types import Config, StreamSlice
16+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
17+
18+
19+
@dataclass
class AsyncJobPartitionRouter(StreamSlicer):
    """
    Stream slicer whose slices are produced by asynchronous jobs.

    The slices of the wrapped `stream_slicer` are handed to a job orchestrator built by
    `job_orchestrator_factory`. Every partition the orchestrator reports as completed is
    emitted as a stream slice, and that partition can then be passed to `fetch_records`
    to read its records.
    """

    config: Config
    parameters: InitVar[Mapping[str, Any]]
    job_orchestrator_factory: Callable[[Iterable[StreamSlice]], AsyncJobOrchestrator]
    stream_slicer: StreamSlicer = field(
        default_factory=lambda: SinglePartitionRouter(parameters={})
    )

    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
        self._parameters = parameters
        self._job_orchestrator_factory = self.job_orchestrator_factory
        # Stays None until `stream_slices` starts running; `fetch_records` checks for this.
        self._job_orchestrator: Optional[AsyncJobOrchestrator] = None

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Yield one stream slice per partition completed by the job orchestrator.

        A new orchestrator is built from the wrapped slicer's slices and stored on the
        instance. Because this is a generator, that only happens once iteration begins.
        Each yielded slice carries the originating slice's partition values plus a
        "partition" entry holding the completed partition, and the originating cursor slice.
        """
        orchestrator = self._job_orchestrator_factory(self.stream_slicer.stream_slices())
        self._job_orchestrator = orchestrator

        for finished in orchestrator.create_and_get_completed_partitions():
            origin = finished.stream_slice
            # The "partition" entry overrides any key of the same name from the origin slice.
            merged_partition = dict(origin.partition)
            merged_partition["partition"] = finished
            yield StreamSlice(
                partition=merged_partition,
                cursor_slice=origin.cursor_slice,
            )

    def fetch_records(self, partition: AsyncPartition) -> Iterable[Mapping[str, Any]]:
        """
        Return the records of `partition` by delegating to the job orchestrator.

        Reading records goes beyond what a PartitionRouter/StreamSlicer should be responsible
        for. It is defined here because the JobOrchestrator is required to retrieve records;
        the alternatives were passing the JobOrchestrator to the AsyncRetriever or storing it
        on multiple classes.

        Raises an AirbyteTracedException with a system_error failure type when no orchestrator
        has been created yet, i.e. `stream_slices` has not started running.
        """
        orchestrator = self._job_orchestrator
        if orchestrator:
            return orchestrator.fetch_records(partition=partition)

        raise AirbyteTracedException(
            message="Invalid state within AsyncJobRetriever. Please contact Airbyte Support",
            internal_message="AsyncPartitionRepository is expected to be accessed only after `stream_slices`",
            failure_type=FailureType.system_error,
        )

0 commit comments

Comments
 (0)