Skip to content

Commit

Permalink
feat:🆕 PR #9 from DP6/firestore-source
Browse files Browse the repository at this point in the history
feat:🆕  Add Firestore source
  • Loading branch information
joaquimsn committed May 3, 2021
2 parents 488dc58 + c50a5ac commit 5151d4b
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 3 deletions.
14 changes: 11 additions & 3 deletions megalist_dataflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from mappers.ads_ssd_hashing_mapper import AdsSSDHashingMapper
from mappers.ads_user_list_pii_hashing_mapper import AdsUserListPIIHashingMapper
from sources.spreadsheet_execution_source import SpreadsheetExecutionSource
from sources.firestore_execution_source import FirestoreExecutionSource
from sources.batches_from_executions import BatchesFromExecutions
from uploaders.appsflyer.appsflyer_s2s_uploader_async import AppsFlyerS2SUploaderDoFn
from uploaders.campaign_manager.campaign_manager_conversion_uploader import CampaignManagerConversionUploaderDoFn
Expand Down Expand Up @@ -185,11 +186,18 @@ def run(argv=None):
dataflow_options.access_token,
dataflow_options.refresh_token)

sheets_config = SheetsConfig(oauth_credentials)
if dataflow_options.setup_sheet_id.is_accessible():
sheets_config = SheetsConfig(oauth_credentials)

with beam.Pipeline(options=pipeline_options) as pipeline:
executions = (pipeline | 'Load executions' >> beam.io.Read(
SpreadsheetExecutionSource(sheets_config, dataflow_options.setup_sheet_id)))
if dataflow_options.setup_sheet_id.is_accessible():
executions = (pipeline | 'Load executions' >> beam.io.Read(
SpreadsheetExecutionSource(sheets_config, dataflow_options.setup_sheet_id)))
elif dataflow_options.setup_firestore_collection.is_accessible():
executions = (pipeline | 'Load executions' >> beam.io.Read(
FirestoreExecutionSource(dataflow_options.setup_firestore_collection)))
else:
raise Exception('No valid parameter source (setup_sheet_id/setup_firestore_collection) included in the arguments')

executions | GoogleAdsSSDStep(
oauth_credentials, dataflow_options, AdsSSDHashingMapper())
Expand Down
2 changes: 2 additions & 0 deletions megalist_dataflow/models/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def _add_argparse_args(cls, parser):
# Set up
parser.add_value_provider_argument(
'--setup_sheet_id', help='Id of Spreadsheet with execution info')
parser.add_value_provider_argument(
'--setup_firestore_collection', help='Name of Firestore collection with execution info')
parser.add_value_provider_argument(
'--bq_ops_dataset',
help='Auxliary bigquery dataset used for Megalista operations')
Expand Down
127 changes: 127 additions & 0 deletions megalist_dataflow/sources/firestore_execution_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import distutils.util
import logging

from apache_beam.options.value_provider import ValueProvider

from google.cloud import firestore
from sources.base_bounded_source import BaseBoundedSource
from models.execution import Destination, DestinationType
from models.execution import Execution, AccountConfig
from models.execution import Source, SourceType


class FirestoreExecutionSource(BaseBoundedSource):
  """
  Reads Execution data from a Firestore collection.

  The collection name is supplied at runtime through the
  "setup_firestore_collection" pipeline parameter. The collection is expected
  to hold one document per scheduled execution (each with an 'active' field
  set to 'yes' when enabled) plus a special 'account_config' document with
  account-level settings.
  """

  def __init__(
      self,
      setup_firestore_collection: ValueProvider
  ):
    super().__init__()
    self._setup_firestore_collection = setup_firestore_collection

  def _do_count(self):
    # TODO: implement count.
    # The hard-coded value only feeds Beam's size estimation for this bounded
    # source; it does not affect which executions are emitted by read().
    return 3

  def read(self, range_tracker):
    """Yields one Execution per active document in the configured collection.

    Raises:
      Exception: if the 'account_config' document is missing from the
        collection.
    """
    logger = logging.getLogger("megalista.FirestoreExecutionSource")

    def document_to_dict(doc):
      # Flatten a Firestore DocumentSnapshot into a plain dict, keeping the
      # document id under the 'id' key. Returns None for absent documents.
      if not doc.exists:
        return None
      doc_dict = doc.to_dict()
      doc_dict['id'] = doc.id
      return doc_dict

    # Resolve the ValueProvider once and reuse the plain string below
    # (the original code called .get() three times).
    firestore_collection = self._setup_firestore_collection.get()
    logger.info(f"Loading Firestore collection {firestore_collection}...")
    db = firestore.Client()
    entries = db.collection(firestore_collection).where('active', '==', 'yes').stream()
    entries = [document_to_dict(doc) for doc in entries]

    account_data = document_to_dict(
        db.collection(firestore_collection).document('account_config').get())

    if not account_data:
      raise Exception('Firestore collection is absent')
    google_ads_id = account_data.get('google_ads_id', 'empty')
    mcc_trix = account_data.get('mcc_trix', 'FALSE')
    # mcc_trix is assumed to be a string flag ('TRUE'/'FALSE'); a stored None
    # falls back to mcc=False. A non-string value would raise in strtobool —
    # TODO confirm the account_config schema guarantees a string here.
    mcc = False if mcc_trix is None else bool(distutils.util.strtobool(mcc_trix))
    app_id = account_data.get('app_id', 'empty')
    google_analytics_account_id = account_data.get('google_analytics_account_id', 'empty')
    campaign_manager_account_id = account_data.get('campaign_manager_account_id', 'empty')

    account_config = AccountConfig(google_ads_id, mcc, google_analytics_account_id, campaign_manager_account_id, app_id)
    logger.info(f"Loaded: {account_config}")

    sources = self._read_sources(entries)
    destinations = self._read_destination(entries)
    if entries:
      for entry in entries:
        # The query above already filters active == 'yes'; this guard is kept
        # as a defensive check against other casings slipping through.
        if entry['active'].upper() == 'YES':
          logger.info(
              f"Executing step Source:{sources[entry['id'] + '_source'].source_name} -> Destination:{destinations[entry['id'] + '_destination'].destination_name}")
          yield Execution(account_config, sources[entry['id'] + '_source'], destinations[entry['id'] + '_destination'])
    else:
      # logger.warn is a deprecated alias of logger.warning.
      logger.warning("No schedules found!")

  @staticmethod
  def _read_sources(entries):
    """Builds a {source_name: Source} map from the execution documents."""
    sources = {}
    if entries:
      for entry in entries:
        metadata = [entry['bq_dataset'], entry['bq_table']]  # TODO: flexibilize for other source types
        source = Source(entry['id'] + '_source', SourceType[entry['source']], metadata)
        sources[source.source_name] = source
    else:
      logging.getLogger("megalista.FirestoreExecutionSource").warning("No sources found!")
    return sources

  @staticmethod
  def _read_destination(entries):
    """Builds a {destination_name: Destination} map from the documents.

    Raises:
      Exception: for an unknown destination type or a missing metadata field.
    """
    def create_metadata_list(entry):
      # Required metadata fields, by destination type.
      metadata_list = {
          'ADS_OFFLINE_CONVERSION': ['gads_conversion_name'],
          'ADS_SSD_UPLOAD': ['gads_conversion_name', 'gads_external_upload_id'],
          'ADS_CUSTOMER_MATCH_CONTACT_INFO_UPLOAD': ['gads_audience_name', 'gads_operation', 'gads_hash'],
          'ADS_CUSTOMER_MATCH_MOBILE_DEVICE_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
          'ADS_CUSTOMER_MATCH_USER_ID_UPLOAD': ['gads_audience_name', 'gads_operation'],
          'GA_MEASUREMENT_PROTOCOL': ['google_analytics_property_id', 'google_analytics_non_interaction'],
          'CM_OFFLINE_CONVERSION': ['campaign_manager_floodlight_activity_id', 'campaign_manager_floodlight_configuration_id'],
          'APPSFLYER_S2S_EVENTS': ['appsflyer_app_id'],
      }

      entry_type = entry['type']
      metadata = metadata_list.get(entry_type, None)
      if not metadata:
        raise Exception(f'Upload type not implemented: {entry_type}')
      entry_metadata = []
      for m in metadata:
        if m in entry:
          entry_metadata.append(entry[m])
        else:
          raise Exception(f'Missing field in Firestore document for {entry_type}: {m}')
      return entry_metadata

    destinations = {}
    if entries:
      for entry in entries:
        destination = Destination(entry['id'] + '_destination', DestinationType[entry['type']], create_metadata_list(entry))
        destinations[destination.destination_name] = destination
    else:
      logging.getLogger("megalista.FirestoreExecutionSource").warning("No destinations found!")
    return destinations

0 comments on commit 5151d4b

Please sign in to comment.