From 7896b2c952ba1a2d7a108bd7db2b4ccbe57470f7 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Wed, 6 Nov 2024 16:07:13 +0500 Subject: [PATCH 01/30] chore: Update folder name --- docs/conf.py | 6 +++--- {edx_prefectutils => edx_argoutils}/__init__.py | 0 {edx_prefectutils => edx_argoutils}/bigquery.py | 0 {edx_prefectutils => edx_argoutils}/common.py | 0 {edx_prefectutils => edx_argoutils}/edx_api_client.py | 0 {edx_prefectutils => edx_argoutils}/email_unsubscribes.py | 0 {edx_prefectutils => edx_argoutils}/hubspot_leads.py | 0 {edx_prefectutils => edx_argoutils}/mysql.py | 2 +- {edx_prefectutils => edx_argoutils}/paypal.py | 2 +- {edx_prefectutils => edx_argoutils}/paypal_xml.py | 0 {edx_prefectutils => edx_argoutils}/record.py | 0 {edx_prefectutils => edx_argoutils}/s3.py | 0 {edx_prefectutils => edx_argoutils}/sitemap.py | 0 {edx_prefectutils => edx_argoutils}/snowflake.py | 2 +- {edx_prefectutils => edx_argoutils}/vault_secrets.py | 0 tests/test_bigquery.py | 2 +- tests/test_common.py | 2 +- tests/test_edx_api_client.py | 2 +- tests/test_mysql.py | 2 +- tests/test_paypal.py | 2 +- tests/test_paypal_xml.py | 2 +- tests/test_s3.py | 2 +- tests/test_sitemap.py | 2 +- tests/test_snowflake.py | 2 +- tests/test_vault_secrets.py | 2 +- 25 files changed, 16 insertions(+), 16 deletions(-) rename {edx_prefectutils => edx_argoutils}/__init__.py (100%) rename {edx_prefectutils => edx_argoutils}/bigquery.py (100%) rename {edx_prefectutils => edx_argoutils}/common.py (100%) rename {edx_prefectutils => edx_argoutils}/edx_api_client.py (100%) rename {edx_prefectutils => edx_argoutils}/email_unsubscribes.py (100%) rename {edx_prefectutils => edx_argoutils}/hubspot_leads.py (100%) rename {edx_prefectutils => edx_argoutils}/mysql.py (99%) rename {edx_prefectutils => edx_argoutils}/paypal.py (98%) rename {edx_prefectutils => edx_argoutils}/paypal_xml.py (100%) rename {edx_prefectutils => edx_argoutils}/record.py (100%) rename {edx_prefectutils => edx_argoutils}/s3.py (100%) rename {edx_prefectutils => edx_argoutils}/sitemap.py (100%) rename {edx_prefectutils => edx_argoutils}/snowflake.py (99%) rename {edx_prefectutils => edx_argoutils}/vault_secrets.py (100%) diff --git a/docs/conf.py b/docs/conf.py index a904ff7..11fb21e 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ import sys sys.path.insert(0, os.path.abspath('..')) -import edx_prefectutils +import edx_argoutils # -- General configuration --------------------------------------------- @@ -55,9 +55,9 @@ # the built documents. # # The short X.Y version. -version = edx_prefectutils.__version__ +version = edx_argoutils.__version__ # The full version, including alpha/beta/rc tags. -release = edx_prefectutils.__version__ +release = edx_argoutils.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/edx_prefectutils/__init__.py b/edx_argoutils/__init__.py similarity index 100% rename from edx_prefectutils/__init__.py rename to edx_argoutils/__init__.py diff --git a/edx_prefectutils/bigquery.py b/edx_argoutils/bigquery.py similarity index 100% rename from edx_prefectutils/bigquery.py rename to edx_argoutils/bigquery.py diff --git a/edx_prefectutils/common.py b/edx_argoutils/common.py similarity index 100% rename from edx_prefectutils/common.py rename to edx_argoutils/common.py diff --git a/edx_prefectutils/edx_api_client.py b/edx_argoutils/edx_api_client.py similarity index 100% rename from edx_prefectutils/edx_api_client.py rename to edx_argoutils/edx_api_client.py diff --git a/edx_prefectutils/email_unsubscribes.py b/edx_argoutils/email_unsubscribes.py similarity index 100% rename from edx_prefectutils/email_unsubscribes.py rename to edx_argoutils/email_unsubscribes.py diff --git a/edx_prefectutils/hubspot_leads.py b/edx_argoutils/hubspot_leads.py similarity index 100% rename from edx_prefectutils/hubspot_leads.py rename to edx_argoutils/hubspot_leads.py diff --git a/edx_prefectutils/mysql.py b/edx_argoutils/mysql.py similarity index 99% rename from edx_prefectutils/mysql.py rename to edx_argoutils/mysql.py index c4db16c..7e23561 100644 --- a/edx_prefectutils/mysql.py +++ b/edx_argoutils/mysql.py @@ -8,7 +8,7 @@ from prefect.engine import signals from prefect.utilities.logging import get_logger -from edx_prefectutils.snowflake import MANIFEST_FILE_NAME +from edx_argoutils.snowflake import MANIFEST_FILE_NAME def create_mysql_connection(credentials: dict, database: str, autocommit: bool = False): diff --git a/edx_prefectutils/paypal.py b/edx_argoutils/paypal.py similarity index 98% rename from edx_prefectutils/paypal.py rename to edx_argoutils/paypal.py index 9c5c2ca..805aa1d 100644 --- a/edx_prefectutils/paypal.py +++ b/edx_argoutils/paypal.py @@ -11,7 +11,7 @@ from prefect import config, task from prefect.engine import signals -from edx_prefectutils.s3 import get_s3_path_for_date, list_object_keys_from_s3 +from edx_argoutils.s3 import get_s3_path_for_date, list_object_keys_from_s3 def check_paypal_report(sftp_connection, remote_filename, check_column_name): diff --git a/edx_prefectutils/paypal_xml.py b/edx_argoutils/paypal_xml.py similarity index 100% rename from edx_prefectutils/paypal_xml.py rename to edx_argoutils/paypal_xml.py diff --git a/edx_prefectutils/record.py b/edx_argoutils/record.py similarity index 100% rename from edx_prefectutils/record.py rename to edx_argoutils/record.py diff --git a/edx_prefectutils/s3.py b/edx_argoutils/s3.py similarity index 100% rename from edx_prefectutils/s3.py rename to edx_argoutils/s3.py diff --git a/edx_prefectutils/sitemap.py b/edx_argoutils/sitemap.py similarity index 100% rename from edx_prefectutils/sitemap.py rename to edx_argoutils/sitemap.py diff --git a/edx_prefectutils/snowflake.py b/edx_argoutils/snowflake.py similarity index 99% rename from edx_prefectutils/snowflake.py rename to edx_argoutils/snowflake.py index 0e4b379..ed14fac 100644 --- a/edx_prefectutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -16,7 +16,7 @@ from prefect.tasks.aws import s3 from prefect.utilities.logging import get_logger -from edx_prefectutils import s3 as s3_utils +from edx_argoutils import s3 as s3_utils MANIFEST_FILE_NAME = 'manifest.json' EXPORT_MAX_FILESIZE = 104857600 diff --git a/edx_prefectutils/vault_secrets.py b/edx_argoutils/vault_secrets.py similarity index 100% rename from edx_prefectutils/vault_secrets.py rename to edx_argoutils/vault_secrets.py diff --git a/tests/test_bigquery.py b/tests/test_bigquery.py index e9283bb..7cd475f 100644 --- a/tests/test_bigquery.py +++ b/tests/test_bigquery.py @@ -7,7 +7,7 @@ from prefect.core import Flow from pytest_mock import mocker # noqa: F401 -from edx_prefectutils import bigquery +from edx_argoutils import bigquery def test_cleanup_gcs_files(mocker): # noqa: F811 diff --git a/tests/test_common.py b/tests/test_common.py index ddb597f..65c6e43 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -6,7 +6,7 @@ from prefect.core import Flow -from edx_prefectutils import common +from edx_argoutils import common def test_generate_dates(): diff --git a/tests/test_edx_api_client.py b/tests/test_edx_api_client.py index e3214d9..b7fc42e 100644 --- a/tests/test_edx_api_client.py +++ b/tests/test_edx_api_client.py @@ -9,7 +9,7 @@ from ddt import data, ddt, unpack from mock import patch -from edx_prefectutils.edx_api_client import EdxApiClient +from edx_argoutils.edx_api_client import EdxApiClient FAKE_AUTH_URL = 'http://example.com/oauth2/access_token' FAKE_CLIENT_ID = 'aclientid' diff --git a/tests/test_mysql.py b/tests/test_mysql.py index bf76375..be1dc58 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -4,7 +4,7 @@ from prefect.engine import signals from pytest_mock import mocker # noqa: F401 -from edx_prefectutils import mysql as utils_mysql +from edx_argoutils import mysql as utils_mysql @pytest.fixture diff --git a/tests/test_paypal.py b/tests/test_paypal.py index 7c109f7..7870747 100644 --- a/tests/test_paypal.py +++ b/tests/test_paypal.py @@ -6,7 +6,7 @@ from mock import Mock -from edx_prefectutils.paypal import get_paypal_filename +from edx_argoutils.paypal import get_paypal_filename def test_get_paypal_filename(): diff --git a/tests/test_paypal_xml.py b/tests/test_paypal_xml.py index 67d5258..0e608c7 100644 --- a/tests/test_paypal_xml.py +++ b/tests/test_paypal_xml.py @@ -6,7 +6,7 @@ import httpretty from ddt import data, ddt, unpack -from edx_prefectutils.paypal_xml import (ColumnMetadata, +from edx_argoutils.paypal_xml import (ColumnMetadata, PaypalApiRequestFailedError, PaypalMalformedResponseError, PaypalReportDataRequest, diff --git a/tests/test_s3.py b/tests/test_s3.py index 3fe95cb..aa4be2d 100644 --- a/tests/test_s3.py +++ b/tests/test_s3.py @@ -5,7 +5,7 @@ from mock import patch from mock.mock import MagicMock -from edx_prefectutils.s3 import delete_s3_directory +from edx_argoutils.s3 import delete_s3_directory @patch("edx_prefectutils.s3.list_object_keys_from_s3.run") diff --git a/tests/test_sitemap.py b/tests/test_sitemap.py index 79b140e..2792a77 100644 --- a/tests/test_sitemap.py +++ b/tests/test_sitemap.py @@ -8,7 +8,7 @@ from mock import Mock, patch from prefect import context -from edx_prefectutils.sitemap import fetch_sitemap, fetch_sitemap_urls +from edx_argoutils.sitemap import fetch_sitemap, fetch_sitemap_urls SCRAPED_AT = '2021-10-22T15:14:16.683985+00:00' diff --git a/tests/test_snowflake.py b/tests/test_snowflake.py index 9af501b..20adc87 100755 --- a/tests/test_snowflake.py +++ b/tests/test_snowflake.py @@ -14,7 +14,7 @@ from pytest_mock import mocker # noqa: F401 from snowflake.connector import ProgrammingError -from edx_prefectutils import snowflake +from edx_argoutils import snowflake def test_qualified_table_name(): diff --git a/tests/test_vault_secrets.py b/tests/test_vault_secrets.py index 30d0179..57b872f 100644 --- a/tests/test_vault_secrets.py +++ b/tests/test_vault_secrets.py @@ -7,7 +7,7 @@ from prefect import Flow, task, unmapped from pytest_mock import mocker # noqa: F401 -from edx_prefectutils import vault_secrets +from edx_argoutils import vault_secrets @task From 93031a213257ec7ad8c06a2aa8a192d20b71dfc5 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Thu, 7 Nov 2024 12:57:51 +0500 Subject: [PATCH 02/30] chore: update package name --- .coveragerc | 2 +- .github/ISSUE_TEMPLATE.md | 2 +- CONTRIBUTING.rst | 24 ++++++++++++------------ Makefile | 8 ++++---- README.rst | 13 +++++-------- docs/Makefile | 2 +- docs/conf.py | 26 +++++++++++++------------- docs/index.rst | 2 +- docs/installation.rst | 16 ++++++++-------- docs/make.bat | 2 +- docs/usage.rst | 4 ++-- edx_argoutils/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 14 +++++++------- tests/test_bigquery.py | 2 +- tests/test_common.py | 2 +- tests/test_edx_api_client.py | 2 +- tests/test_s3.py | 4 ++-- tests/test_snowflake.py | 10 +++++----- tests/test_vault_secrets.py | 2 +- tox.ini | 4 ++-- 21 files changed, 71 insertions(+), 74 deletions(-) diff --git a/.coveragerc b/.coveragerc index 7b815a0..4936a25 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,4 +1,4 @@ [run] branch = True data_file = .coverage -source=edx_prefectutils +source=edx_argoutils diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 480e412..eb8a4fd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,4 +1,4 @@ -* edx-prefectutils version: +* edx-argoutils version: * Python version: * Operating System: diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 6e052f9..0555c75 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,7 +15,7 @@ Types of Contributions Report Bugs ~~~~~~~~~~~ -Report bugs at https://github.com/edx/edx-prefectutils/issues. +Report bugs at https://github.com/2uinc/edx-argoutils/issues. If you are reporting a bug, please include: @@ -38,14 +38,14 @@ and "help wanted" is open to whoever wants to implement it. Write Documentation ~~~~~~~~~~~~~~~~~~~ -edx-prefectutils could always use more documentation, whether as part of the -official edx-prefectutils docs, in docstrings, or even on the web in blog posts, +edx-argoutils could always use more documentation, whether as part of the +official edx-argoutils docs, in docstrings, or even on the web in blog posts, articles, and such. Submit Feedback ~~~~~~~~~~~~~~~ -The best way to send feedback is to file an issue at https://github.com/edx/edx-prefectutils/issues. +The best way to send feedback is to file an issue at https://github.com/2uinc/edx-argoutils/issues. If you are proposing a feature: @@ -57,17 +57,17 @@ If you are proposing a feature: Get Started! ------------ -Ready to contribute? Here's how to set up `edx-prefectutils` for local development. +Ready to contribute? Here's how to set up `edx-argoutils` for local development. -1. Fork the `edx-prefectutils` repo on GitHub. +1. Fork the `edx-argoutils` repo on GitHub. 2. Clone your fork locally:: - $ git clone git@github.com:your_name_here/edx-prefectutils.git + $ git clone git@github.com:your_name_here/edx-argoutils.git 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: - $ mkvirtualenv edx-prefectutils - $ cd edx-prefectutils/ + $ mkvirtualenv edx-argoutils + $ cd edx-argoutils/ $ python setup.py develop 4. Create a branch for local development:: @@ -79,7 +79,7 @@ Ready to contribute? Here's how to set up `edx-prefectutils` for local developme 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: - $ flake8 edx_prefectutils tests + $ flake8 edx_argoutils tests $ python setup.py test or pytest $ tox @@ -103,7 +103,7 @@ Before you submit a pull request, check that it meets these guidelines: your new functionality into a function with a docstring, and add the feature to the list in README.rst. 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check - https://github.com/edx/edx-prefectutils/actions?query=workflow%3A%22Python+CI%22 + https://github.com/2uinc/edx-argoutils/actions?query=workflow%3A%22Python+CI%22 and make sure that the tests pass for all supported Python versions. Tips @@ -111,7 +111,7 @@ Tips To run a subset of tests:: -$ pytest tests.test_edx_prefectutils +$ pytest tests.test_edx_argoutils Deploying diff --git a/Makefile b/Makefile index 145c3c0..3911d20 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ clean-test: ## remove test and coverage artifacts rm -fr .pytest_cache lint: ## check style with flake8 - flake8 edx_prefectutils tests + flake8 edx_argoutils tests test: ## run tests quickly with the default Python pytest @@ -57,15 +57,15 @@ test-all: ## run tests on every Python version with tox tox coverage: ## check code coverage quickly with the default Python - coverage run --source edx_prefectutils -m pytest + coverage run --source edx_argoutils -m pytest coverage report -m coverage html $(BROWSER) htmlcov/index.html docs: ## generate Sphinx HTML documentation, including API docs - rm -f docs/edx_prefectutils.rst + rm -f docs/edx_argoutils.rst rm -f docs/modules.rst - sphinx-apidoc -o docs/ edx_prefectutils + sphinx-apidoc -o docs/ edx_argoutils $(MAKE) -C docs clean $(MAKE) -C docs html $(BROWSER) docs/_build/html/index.html diff --git a/README.rst b/README.rst index 6ad6e8e..8d89790 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ ================ -edx-prefectutils +edx-argoutils ================ @@ -7,7 +7,7 @@ edx-prefectutils -A utility code written by edX specifically for edX Prefect Flows. It assists writing in edX Prefect Flows. +A utility code written by edX specifically for edX Argo Flows. It assists writing in edX Argo Flows. @@ -20,14 +20,11 @@ PyPI Package Release -------------------- - This repository publishes packages on PyPI, when you update this repository make sure to create a new release so latest changes would be published on PyPI and become available for use. - Bump the `version`_ to match the version that will be released. Once the pull request with the updated version is merged into master, create a new release from GitHub Web UI, using the same version number. You can refer to `How to create GitHub release`_ for more information on this process. -- `edx-prefectutils PyPI`_ package is used in `prefect-flows`_ repository, you will have to update edx-prefectutils package version in `requirements`_ so it will pick latest PyPI package version. -.. _`version`: https://github.com/edx/edx-prefectutils/blob/master/edx_prefectutils/__init__.py#L5 -.. _`How to create GitHub release`: https://docs.github.com/en/github/administering-a-repository/releasing-projects-on-github/managing-releases-in-a-repository -.. _`edx-prefectutils PyPI`: https://pypi.org/project/edx-prefectutils/ -.. _`prefect-flows`: https://github.com/edx/prefect-flows -.. _`requirements`: https://github.com/edx/prefect-flows/blob/master/requirements.txt#L7 +.. _`version`: https://github.com/2uinc/edx-argoutils/blob/master/edx_argoutils/__init__.py#L5 +.. _`How to create GitHub release`: https://docs.github.com/en/github/administering-a-repository/releasing-projects-on-github/managing-releases-in-a-repository +.. _`edx-argoutils PyPI`: https://pypi.org/project/edx-argoutils/ Credits diff --git a/docs/Makefile b/docs/Makefile index bebcfed..46b8533 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -4,7 +4,7 @@ # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python -msphinx -SPHINXPROJ = edx_prefectutils +SPHINXPROJ = edx_argoutils SOURCEDIR = . BUILDDIR = _build diff --git a/docs/conf.py b/docs/conf.py index 11fb21e..eea4b4d 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# edx_prefectutils documentation build configuration file, created by +# edx_argoutils documentation build configuration file, created by # sphinx-quickstart on Fri Jun 9 13:47:02 2017. # # This file is execfile()d with the current directory set to its @@ -46,9 +46,9 @@ top_level_doc = 'index' # General information about the project. -project = 'edx-prefectutils' -copyright = "2020, Julia Eskew" -author = "Julia Eskew" +project = 'edx-argoutils' +copyright = "2024, Abdul Rafey" +author = "Abdul Rafey" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -100,7 +100,7 @@ # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'edx_prefectutilsdoc' +htmlhelp_basename = 'edx_argoutilsdoc' # -- Options for LaTeX output ------------------------------------------ @@ -127,9 +127,9 @@ # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). latex_documents = [ - (top_level_doc, 'edx_prefectutils.tex', - 'edx-prefectutils Documentation', - 'Julia Eskew', 'manual'), + (top_level_doc, 'edx_argoutils.tex', + 'edx-argoutils Documentation', + 'Abdul Rafey', 'manual'), ] @@ -138,8 +138,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (top_level_doc, 'edx_prefectutils', - 'edx-prefectutils Documentation', + (top_level_doc, 'edx_argoutils', + 'edx-argoutils Documentation', [author], 1) ] @@ -150,10 +150,10 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (top_level_doc, 'edx_prefectutils', - 'edx-prefectutils Documentation', + (top_level_doc, 'edx_argoutils', + 'edx-argoutils Documentation', author, - 'edx_prefectutils', + 'edx_argoutils', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/index.rst b/docs/index.rst index 6ee957c..657cb66 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -Welcome to edx-prefectutils's documentation! +Welcome to edx-argoutils's documentation! ========================================= .. toctree:: diff --git a/docs/installation.rst b/docs/installation.rst index 85b2a40..2dcd72f 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -8,13 +8,13 @@ Installation Stable release -------------- -To install edx-prefectutils, run this command in your terminal: +To install edx-argoutils, run this command in your terminal: .. code-block:: console - $ pip install edx_prefectutils + $ pip install edx_argoutils -This is the preferred method to install edx-prefectutils, as it will always install the most recent stable release. +This is the preferred method to install edx-argoutils, as it will always install the most recent stable release. If you don't have `pip`_ installed, this `Python installation guide`_ can guide you through the process. @@ -26,19 +26,19 @@ you through the process. From sources ------------ -The sources for edx-prefectutils can be downloaded from the `Github repo`_. +The sources for edx-argoutils can be downloaded from the `Github repo`_. You can either clone the public repository: .. code-block:: console - $ git clone git://github.com/edx/edx-prefectutils + $ git clone git://github.com/2uinc/edx-argoutils Or download the `tarball`_: .. code-block:: console - $ curl -OJL https://github.com/edx/edx-prefectutils/tarball/master + $ curl -OJL https://github.com/2uinc/edx-argoutils/tarball/master Once you have a copy of the source, you can install it with: @@ -47,5 +47,5 @@ Once you have a copy of the source, you can install it with: $ python setup.py install -.. _Github repo: https://github.com/edx/edx-prefectutils -.. _tarball: https://github.com/edx/edx-prefectutils/tarball/master +.. _Github repo: https://github.com/2uinc/edx-argoutils +.. _tarball: https://github.com/2uinc/edx-argoutils/tarball/master diff --git a/docs/make.bat b/docs/make.bat index 5b7a6bd..c5c12af 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -9,7 +9,7 @@ if "%SPHINXBUILD%" == "" ( ) set SOURCEDIR=. set BUILDDIR=_build -set SPHINXPROJ=edx_prefectutils +set SPHINXPROJ=edx_argoutils if "%1" == "" goto help diff --git a/docs/usage.rst b/docs/usage.rst index 47d996b..be4f3cc 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,6 +2,6 @@ Usage ===== -To use edx-prefectutils in a project:: +To use edx-argoutils in a project:: - import edx_prefectutils + import edx_argoutils diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 7dfb5e8..8017261 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -1,5 +1,5 @@ """ -Top-level package for edx-prefectutils. +Top-level package for edx-argoutils. """ __version__ = '2.4.4' diff --git a/setup.cfg b/setup.cfg index eb35ff5..3fa9d0a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,7 @@ tag = True search = version='{current_version}' replace = version='{new_version}' -[bumpversion:file:edx_prefectutils/__init__.py] +[bumpversion:file:edx_argoutils/__init__.py] search = __version__ = '{current_version}' replace = __version__ = '{new_version}' diff --git a/setup.py b/setup.py index d1ac6b9..c9aa1d3 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def is_requirement(line): line.startswith('-c') ) -VERSION = get_version('edx_prefectutils', '__init__.py') +VERSION = get_version('edx_argoutils', '__init__.py') if sys.argv[-1] == 'tag': print("Tagging the version on github:") @@ -83,22 +83,22 @@ def is_requirement(line): 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.8', ], - description="Utility code to assist in writing Prefect Flows.", + description="Utility code to assist in writing Argo Flows.", entry_points={ 'console_scripts': [ - 'edx_prefectutils=edx_prefectutils.cli:main', + 'edx_argoutils=edx_argoutils.cli:main', ], }, install_requires=load_requirements('requirements/base.in'), long_description=README, include_package_data=True, - keywords='edx_prefectutils', - name='edx-prefectutils', - packages=find_packages(include=['edx_prefectutils', 'edx_prefectutils.*']), + keywords='edx_argoutils', + name='edx-argoutils', + packages=find_packages(include=['edx_argoutils', 'edx_argoutils.*']), setup_requires=setup_requirements, test_suite='tests', tests_require=test_requirements, - url='https://github.com/edx/edx-prefectutils', + url='https://github.com/2uinc/edx-argoutils', version=VERSION, zip_safe=False, ) diff --git a/tests/test_bigquery.py b/tests/test_bigquery.py index 7cd475f..6cf9427 100644 --- a/tests/test_bigquery.py +++ b/tests/test_bigquery.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Tests for BigQuery utils in the `edx_prefectutils` package. +Tests for BigQuery utils in the `edx_argoutils` package. """ from prefect.core import Flow diff --git a/tests/test_common.py b/tests/test_common.py index 65c6e43..ffcbb2c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Tests for BigQuery utils in the `edx_prefectutils` package. +Tests for BigQuery utils in the `edx_argoutils` package. """ from prefect.core import Flow diff --git a/tests/test_edx_api_client.py b/tests/test_edx_api_client.py index b7fc42e..46cb98e 100644 --- a/tests/test_edx_api_client.py +++ b/tests/test_edx_api_client.py @@ -39,7 +39,7 @@ def utcnow(cls): """Return the time specified by the time offset""" return self.current_time + timedelta(seconds=self.time_offset) - datetime_patcher = patch('edx_prefectutils.edx_api_client.datetime', MockDateTime) + datetime_patcher = patch('edx_argoutils.edx_api_client.datetime', MockDateTime) datetime_patcher.start() self.addCleanup(datetime_patcher.stop) diff --git a/tests/test_s3.py b/tests/test_s3.py index aa4be2d..9d47bca 100644 --- a/tests/test_s3.py +++ b/tests/test_s3.py @@ -8,8 +8,8 @@ from edx_argoutils.s3 import delete_s3_directory -@patch("edx_prefectutils.s3.list_object_keys_from_s3.run") -@patch("edx_prefectutils.s3.get_boto_client") +@patch("edx_argoutils.s3.list_object_keys_from_s3.run") +@patch("edx_argoutils.s3.get_boto_client") def test_delete_s3_directory(boto_client_mock, list_object_keys_from_s3_mock): """ Test the delete_s3_directory task diff --git a/tests/test_snowflake.py b/tests/test_snowflake.py index 20adc87..9ddb940 100755 --- a/tests/test_snowflake.py +++ b/tests/test_snowflake.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Tests for Snowflake utils in the `edx_prefectutils` package. +Tests for Snowflake utils in the `edx_argoutils` package. """ import json @@ -306,7 +306,7 @@ def test_export_snowflake_table_to_s3_with_exception(mock_sf_connection): def test_export_snowflake_table_to_s3_overwrite(mock_sf_connection): # noqa: F811 mock_cursor = mock_sf_connection.cursor() - with mock.patch('edx_prefectutils.s3.delete_s3_directory.run') as mock_delete_s3_directory: + with mock.patch('edx_argoutils.s3.delete_s3_directory.run') as mock_delete_s3_directory: with Flow("test") as f: snowflake.export_snowflake_table_to_s3( sf_credentials={}, @@ -336,7 +336,7 @@ def test_export_snowflake_table_to_s3_overwrite(mock_sf_connection): # noqa: F8 def test_export_snowflake_table_to_s3_no_escape(mock_sf_connection): # noqa: F811 mock_cursor = mock_sf_connection.cursor() - with mock.patch('edx_prefectutils.s3.delete_s3_directory.run'): + with mock.patch('edx_argoutils.s3.delete_s3_directory.run'): with Flow("test") as f: snowflake.export_snowflake_table_to_s3( sf_credentials={}, @@ -363,7 +363,7 @@ def test_export_snowflake_table_to_s3_no_escape(mock_sf_connection): # noqa: F8 def test_export_snowflake_table_to_s3_no_enclosure(mock_sf_connection): # noqa: F811 mock_cursor = mock_sf_connection.cursor() - with mock.patch('edx_prefectutils.s3.delete_s3_directory.run'): + with mock.patch('edx_argoutils.s3.delete_s3_directory.run'): with Flow("test") as f: snowflake.export_snowflake_table_to_s3( sf_credentials={}, @@ -390,7 +390,7 @@ def test_export_snowflake_table_to_s3_no_enclosure(mock_sf_connection): # noqa: def test_export_snowflake_table_to_s3_no_null_if(mock_sf_connection): # noqa: F811 mock_cursor = mock_sf_connection.cursor() - with mock.patch('edx_prefectutils.s3.delete_s3_directory.run'): + with mock.patch('edx_argoutils.s3.delete_s3_directory.run'): with Flow("test") as f: snowflake.export_snowflake_table_to_s3( sf_credentials={}, diff --git a/tests/test_vault_secrets.py b/tests/test_vault_secrets.py index 57b872f..5330c03 100644 --- a/tests/test_vault_secrets.py +++ b/tests/test_vault_secrets.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Tests for Hashicorp Vault secrets utils in the `edx_prefectutils` package. +Tests for Hashicorp Vault secrets utils in the `edx_argoutils` package. """ from prefect import Flow, task, unmapped diff --git a/tox.ini b/tox.ini index 1c2301b..a311852 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ deps = -r{toxinidir}/requirements/test.txt commands = touch tests/__init__.py - flake8 edx_prefectutils tests + flake8 edx_argoutils tests rm tests/__init__.py - isort --check-only --diff edx_prefectutils tests + isort --check-only --diff edx_argoutils tests From 745805e036e9ca0a85b8e50e21cd6681f38c3e0d Mon Sep 17 00:00:00 2001 From: kmaepa Date: Thu, 7 Nov 2024 12:51:35 +0200 Subject: [PATCH 03/30] chore: add generate_date_range func --- edx_argoutils/common.py | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/edx_argoutils/common.py b/edx_argoutils/common.py index c0d8f04..104eb83 100644 --- a/edx_argoutils/common.py +++ b/edx_argoutils/common.py @@ -12,6 +12,8 @@ from opaque_keys.edx.keys import CourseKey from prefect import task from prefect.engine.results import PrefectResult +from datetime import datetime, timedelta, date + @task @@ -110,3 +112,52 @@ def get_filename_safe_course_id(course_id, replacement_char='_'): # We represent the first four with \w. # TODO: Once we support courses with unicode characters, we will need to revisit this. return re.sub(r'[^\w\.\-]', six.text_type(replacement_char), filename) + +def generate_date_range(start_date= None, end_date= None, is_daily: bool = None): + """ + Generate a list of dates depending on parameters passed. Dates are inclusive. + Custom dates is top priority: start_date & end_date are set, is_daily = False + Daily run: start_date & end_date are both None, is_daily = True + True-up run: start_date & end_date are both None, is_daily = False + + Args: + start_date (str): The start date in YYYY-MM-DD format + end_date (str): The end date in YYYY-MM-DD format + is_daily (bool): Designates if last completed day to run + """ + + if start_date is not None and end_date is not None and is_daily is False: + # Manual run: user entered parameters for custom dates + #logger.info("Setting dates for manual run...") + #start_date= start_date.strftime('%Y-%m-%d') + start_date = datetime.strptime(start_date, "%Y-%m-%d").date() + end_date = datetime.strptime(end_date, "%Y-%m-%d").date() + #end_date= end_date.strftime('%Y-%m-%d') + + elif start_date is None and end_date is None and is_daily is True: + # Daily run: minus 2 lag completed day, eg. if today is 9/14, output is 9/12 + # this is due to timing issues with data freshness for STRIPE_RAW <> Snowflake share + minus_two_lag_date = datetime.strptime(str(date.today()), "%Y-%m-%d").date() - timedelta(days=2) + start_date, end_date = minus_two_lag_date, minus_two_lag_date + + elif start_date is None and end_date is None and is_daily is False: + # True-up: Calculate the last completed month + today = datetime.now().date() + current_month_start = datetime(today.year, today.month, 1).date() + last_month_end = current_month_start - timedelta(days=1) + last_month_start = datetime(last_month_end.year, last_month_end.month, 1).date() + + start_date = last_month_start + end_date = last_month_end + + else: + raise Exception("Incorrect parameters passed!") + + current_date = start_date + date_list = [] + + while current_date <= end_date: + date_list.append(current_date) + current_date += timedelta(days=1) + + return date_list From b411651ee1d59dd538b050b13c23fe669e473525 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Thu, 7 Nov 2024 16:32:28 +0500 Subject: [PATCH 04/30] chore: change version to 1 --- edx_argoutils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 8017261..5807acb 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '2.4.4' +__version__ = '1.0' From 8296e5e65cf9eac16112d68b5c785024e84d7c43 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Thu, 7 Nov 2024 16:43:43 +0500 Subject: [PATCH 05/30] chore: remove deacticated user as author --- AUTHORS.rst | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 27c46b6..e41b875 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -5,7 +5,7 @@ Credits Development Lead ---------------- -* Julia Eskew +* Abdul Rafey Contributors ------------ diff --git a/setup.py b/setup.py index c9aa1d3..ec05f30 100644 --- a/setup.py +++ b/setup.py @@ -71,8 +71,8 @@ def is_requirement(line): test_requirements = ['pytest>=3', ] setup( - author="Julia Eskew", - author_email='jeskew@edx.org', + author="Abdul Rafey", + author_email='arafey@2u.com', python_requires='>=3.8', license='AGPL', classifiers=[ From 980d4f59d80233093cff20d67b3e7f15a20fb5c4 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 7 Nov 2024 17:12:10 +0500 Subject: [PATCH 06/30] refactor: refactored common.py file --- edx_argoutils/__init__.py | 2 +- edx_argoutils/common.py | 38 +++++++++++-------------------- tests/test_common.py | 47 +++++++++++++++------------------------ 3 files changed, 32 insertions(+), 55 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 5807acb..c03a8d7 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0' +__version__ = '1.0.2' diff --git a/edx_argoutils/common.py b/edx_argoutils/common.py index 104eb83..13f2b00 100644 --- a/edx_argoutils/common.py +++ b/edx_argoutils/common.py @@ -1,87 +1,75 @@ """ -Utility methods and tasks for use from a Prefect flow. +Utility functions for use in Argo flows. """ -import datetime import itertools import re - -import prefect import six from opaque_keys import InvalidKeyError from opaque_keys.edx.keys import CourseKey -from prefect import task -from prefect.engine.results import PrefectResult from datetime import datetime, timedelta, date -@task def get_date(date: str): """ Return today's date string if date is None. Otherwise return the passed parameter value. - prefect.context.today is only available at task level, so we cannot use it as a default parameter value. """ if date is None: - return prefect.context.today + return datetime.today().strftime('%Y-%m-%d') else: return date -@task(result=PrefectResult()) def generate_dates(start_date: str, end_date: str, date_format: str = "%Y%m%d"): """ Generates a list of date strings in the format specified by `date_format` from start_date up to but excluding end_date. """ if not start_date: - start_date = prefect.context.yesterday + start_date = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d") if not end_date: - end_date = prefect.context.today + end_date = datetime.today().strftime("%Y-%m-%d") - parsed_start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") - parsed_end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d") + parsed_start_date = datetime.strptime(start_date, "%Y-%m-%d") + parsed_end_date = datetime.strptime(end_date, "%Y-%m-%d") dates = [] while parsed_start_date < parsed_end_date: dates.append(parsed_start_date) - parsed_start_date = parsed_start_date + datetime.timedelta(days=1) + parsed_start_date = parsed_start_date + timedelta(days=1) return [date.strftime(date_format) for date in dates] -@task def generate_month_start_dates(start_date: str, end_date: str, date_format: str = "%Y-%m-%d"): """ Return a list of first days of months within the specified date range. If start_date or end_date is not provided, defaults to yesterday or today respectively. - prefect.context.today is only available at task level, so we cannot use it as a default parameter value. """ if not start_date: - start_date = prefect.context.yesterday + start_date = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d") if not end_date: - end_date = prefect.context.today + end_date = datetime.today().strftime("%Y-%m-%d") # Since our intention is to extract first day of months, we will start by modifying the start and end date # to represent the first day of month. - parsed_start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").replace(day=1) - parsed_end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").replace(day=1) + parsed_start_date = datetime.strptime(start_date, "%Y-%m-%d").replace(day=1) + parsed_end_date = datetime.strptime(end_date, "%Y-%m-%d").replace(day=1) dates = [] current_date = parsed_start_date while current_date <= parsed_end_date: dates.append(current_date) # The addition of 32 days to current_date and then setting the day to 1 is a way to ensure that we move to # the beginning of the next month, even if the month doesn't have exactly 32 days. - current_date += datetime.timedelta(days=32) + current_date += timedelta(days=32) current_date = current_date.replace(day=1) return [date.strftime(date_format) for date in dates] -@task def get_unzipped_cartesian_product(input_lists: list): """ - Generate an unzipped cartesian product of the given list of lists, useful for - generating task parameters for mapping. + Generate an unzipped cartesian product of the given list of lists. For example, get_unzipped_cartesian_product([[1, 2, 3], ["a", "b", "c"]]) would return: diff --git a/tests/test_common.py b/tests/test_common.py index ffcbb2c..0e89c45 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,44 +1,33 @@ #!/usr/bin/env python """ -Tests for BigQuery utils in the `edx_argoutils` package. +Tests for Common utils in the `edx_argoutils`. """ -from prefect.core import Flow - -from edx_argoutils import common +from edx_prefectutils import common def test_generate_dates(): - with Flow("test") as f: - task = common.generate_dates( - start_date='2020-01-01', - end_date='2020-01-05' - ) - state = f.run() - assert state.is_successful() - assert state.result[task].result == ['20200101', '20200102', '20200103', '20200104'] + result = common.generate_dates( + start_date='2020-01-01', + end_date='2020-01-05' + ) + assert result == ['20200101', '20200102', '20200103', '20200104'] def test_generate_month_start_dates(): - with Flow("test") as f: - task = common.generate_month_start_dates( - start_date='2023-01-31', - end_date='2023-05-05' - ) - state = f.run() - assert state.is_successful() - assert state.result[task].result == ['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01', '2023-05-01'] + result = common.generate_month_start_dates( + start_date='2023-01-31', + end_date='2023-05-05' + ) + assert result == ['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01', '2023-05-01'] def test_get_unzipped_cartesian_product(): - with Flow("test") as f: - task = common.get_unzipped_cartesian_product( - input_lists=[[1, 2, 3], ["a", "b", "c"]] - ) - state = f.run() - assert state.is_successful() - assert state.result[task].result == [ - (1, 1, 1, 2, 2, 2, 3, 3, 3), - ("a", "b", "c", "a", "b", "c", "a", "b", "c") + result = common.get_unzipped_cartesian_product( + input_lists=[[1, 2, 3], ["a", "b", "c"]] + ) + assert result == [ + (1, 1, 1, 2, 2, 2, 3, 3, 3), + ("a", "b", "c", "a", "b", "c", "a", "b", "c") ] From 9cddbd20a0437b98f8776545d8a66450db3ad58c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 7 Nov 2024 17:41:59 +0500 Subject: [PATCH 07/30] refactor: refactored sitemap.py --- edx_argoutils/__init__.py | 2 +- edx_argoutils/sitemap.py | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index c03a8d7..0d90168 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.2' +__version__ = '1.0.3' diff --git a/edx_argoutils/sitemap.py b/edx_argoutils/sitemap.py index 8a32f9c..734ffc6 100644 --- a/edx_argoutils/sitemap.py +++ b/edx_argoutils/sitemap.py @@ -1,19 +1,16 @@ """ -Tasks for pulling sitemap data. +Functions for pulling sitemap data. """ import json import xml.etree.ElementTree as ET from os.path import basename, join, splitext from urllib.parse import urlparse - -import prefect +import boto3 import requests -from prefect import task -from prefect.tasks.aws import s3 +from common import getdate -@task def fetch_sitemap_urls(sitemap_index_url: str) -> str: """ Fetches a list of sitemap urls by parsing sitemap index file. @@ -25,12 +22,11 @@ def fetch_sitemap_urls(sitemap_index_url: str) -> str: return sitemap_urls -@task def fetch_sitemap(sitemap_url: str): """ Fetches sitemap data from a given sitemap URL. """ - scraped_at = str(prefect.context.date) + scraped_at = getdate(None) r = requests.get(sitemap_url) sitemap_xml = r.text tree = ET.fromstring(sitemap_xml) @@ -44,17 +40,20 @@ def fetch_sitemap(sitemap_url: str): return sitemap_filename, json.dumps(sitemap_json) -@task def write_sitemap_to_s3(sitemap_data: str, s3_bucket: str, s3_path: str): """ Writes sitemap data in JSON format to S3. """ filename, sitemap_json = sitemap_data - date_path = f'{prefect.context.today}/{filename}.json' - s3_key = join(s3_path, date_path) - s3.S3Upload(bucket=s3_bucket).run( - sitemap_json, - key=s3_key, + today = getdate(None) + date_path = f'{today}/{filename}.json' + s3_key = f'{s3_path}/{date_path}' + s3_client = boto3.client('s3') + s3_client.put_object( + Bucket=s3_bucket, + Key=s3_key, + Body=sitemap_json, + ContentType='application/json' ) return date_path From 3ae8e96ea2b9b824d7c70daef6e64342d3341e3e Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 7 Nov 2024 19:09:11 +0500 Subject: [PATCH 08/30] refactor: fix sitemap refactoring --- edx_argoutils/__init__.py | 2 +- edx_argoutils/sitemap.py | 15 +++-- tests/test_sitemap.py | 125 ++++++++++++++++++++++---------------- 3 files changed, 82 insertions(+), 60 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 0d90168..4a813aa 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.3' +__version__ = '1.0.4' diff --git a/edx_argoutils/sitemap.py b/edx_argoutils/sitemap.py index 734ffc6..d71591f 100644 --- a/edx_argoutils/sitemap.py +++ b/edx_argoutils/sitemap.py @@ -8,7 +8,7 @@ from urllib.parse import urlparse import boto3 import requests -from common import getdate +from .common import get_date def fetch_sitemap_urls(sitemap_index_url: str) -> str: @@ -26,7 +26,7 @@ def fetch_sitemap(sitemap_url: str): """ Fetches sitemap data from a given sitemap URL. """ - scraped_at = getdate(None) + scraped_at = get_date(None) r = requests.get(sitemap_url) sitemap_xml = r.text tree = ET.fromstring(sitemap_xml) @@ -40,15 +40,20 @@ def fetch_sitemap(sitemap_url: str): return sitemap_filename, json.dumps(sitemap_json) -def write_sitemap_to_s3(sitemap_data: str, s3_bucket: str, s3_path: str): +def write_sitemap_to_s3(sitemap_data: str, s3_bucket: str, s3_path: str, credentials: dict = {}): """ Writes sitemap data in JSON format to S3. """ filename, sitemap_json = sitemap_data - today = getdate(None) + today = get_date(None) date_path = f'{today}/{filename}.json' s3_key = f'{s3_path}/{date_path}' - s3_client = boto3.client('s3') + s3_client = boto3.client( + 's3', + aws_access_key_id=credentials.get('AccessKeyId'), + aws_secret_access_key=credentials.get('SecretAccessKey'), + aws_session_token=credentials.get('SessionToken') + ) s3_client.put_object( Bucket=s3_bucket, Key=s3_key, diff --git a/tests/test_sitemap.py b/tests/test_sitemap.py index 2792a77..41741a7 100644 --- a/tests/test_sitemap.py +++ b/tests/test_sitemap.py @@ -1,67 +1,84 @@ """ -Tests for sitemap tasks. +Tests for sitemap functions. +pytest test_sitemap.py """ import json - +import unittest +from unittest.mock import Mock, patch import requests -from mock import Mock, patch -from prefect import context - from edx_argoutils.sitemap import fetch_sitemap, fetch_sitemap_urls SCRAPED_AT = '2021-10-22T15:14:16.683985+00:00' -@patch.object(requests, 'get') -def test_fetch_sitemap_urls(mockget): - mockresponse = Mock() - mockget.return_value = mockresponse - mockresponse.text = """ - - - https://www.foo.com/sitemap-0.xml - - - https://www.foo.com/sitemap-1.xml - - - """ - expected_output = ['https://www.foo.com/sitemap-0.xml', 'https://www.foo.com/sitemap-1.xml'] - task = fetch_sitemap_urls - assert task.run(sitemap_index_url='dummy_url') == expected_output +class TestSitemapTasks(unittest.TestCase): + + @patch.object(requests, 'get') + def test_fetch_sitemap_urls(self, mockget): + # Mock the response from requests.get + mockresponse = Mock() + mockget.return_value = mockresponse + mockresponse.text = """ + + + https://www.foo.com/sitemap-0.xml + + + https://www.foo.com/sitemap-1.xml + + + """ + + # Expected output for the mock sitemap index response + expected_output = ['https://www.foo.com/sitemap-0.xml', 'https://www.foo.com/sitemap-1.xml'] + + # Call the function (directly, without Prefect context) + result = fetch_sitemap_urls(sitemap_index_url='dummy_url') + + # Check if the result matches the expected output + self.assertEqual(result, expected_output) + + + @patch.object(requests, 'get') + def test_fetch_sitemap(self, mockget): + # Mock the response from requests.get + mockresponse = Mock() + mockget.return_value = mockresponse + mockresponse.text = """ + + + https://www.foo.come/terms-service + daily + 0.7 + + + https://www.foo.come/policy + daily + 0.7 + + + https://www.foo.come/policy/security + daily + 0.7 + + + """ + + # Expected output for the mock sitemap response + expected_output = [ + {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/terms-service'}, + {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/policy'}, + {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/policy/security'}, + ] + + # Manually pass SCRAPED_AT (instead of using Prefect context) + sitemap_url = 'https://www.foo.com/sitemap-0.xml' + sitemap_filename, sitemap_json = fetch_sitemap(sitemap_url=sitemap_url, scraped_at=SCRAPED_AT) + # Check if the result matches the expected output + self.assertEqual((sitemap_filename, json.loads(sitemap_json)), ('sitemap-0', expected_output)) -@patch.object(requests, 'get') -def test_fetch_sitemap(mockget): - mockresponse = Mock() - mockget.return_value = mockresponse - mockresponse.text = """ - # noqa: E501 - - https://www.foo.come/terms-service - daily - 0.7 - - - https://www.foo.come/policy - daily - 0.7 - - - https://www.foo.come/policy/security - daily - 0.7 - - - """ - expected_output = [ - {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/terms-service'}, - {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/policy'}, - {'scraped_at': SCRAPED_AT, 'url': 'https://www.foo.come/policy/security'}, - ] - task = fetch_sitemap - with context(date=SCRAPED_AT): - sitemap_filename, sitemap_json = task.run(sitemap_url='https://www.foo.com/sitemap-0.xml') - assert (sitemap_filename, json.loads(sitemap_json)) == ('sitemap-0', expected_output) +if __name__ == '__main__': + unittest.main() From 68591dbe1d60982e2bcec53d471be71d0326eb73 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 7 Nov 2024 21:42:58 +0500 Subject: [PATCH 09/30] fix: remove prefect references from edx api class --- edx_argoutils/__init__.py | 2 +- edx_argoutils/edx_api_client.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 4a813aa..497b775 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.4' +__version__ = '1.0.5' diff --git a/edx_argoutils/edx_api_client.py b/edx_argoutils/edx_api_client.py index d365b0c..8fb6a15 100644 --- a/edx_argoutils/edx_api_client.py +++ b/edx_argoutils/edx_api_client.py @@ -4,7 +4,7 @@ import backoff import requests -from prefect.utilities.logging import get_logger +import logging from requests.auth import AuthBase DEFAULT_RETRY_STATUS_CODES = ( @@ -16,6 +16,7 @@ 520, # This is a custom Cloudwatch code for "Unknown error". ) DEFAULT_TIMEOUT_SECONDS = 7200 +logger = logging.getLogger("edx_api_client") class EdxApiClient(object): @@ -57,7 +58,6 @@ def authenticated_session(self): def ensure_oauth_access_token(self): """Retrieves OAuth 2.0 access token using the client credentials grant and stores it in the request session.""" - logger = get_logger() now = datetime.utcnow() if self._expires_at is None or now >= self._expires_at: logger.info('Token is expired or missing, requesting a new one.') @@ -203,7 +203,6 @@ def __call__(self, r): def log_response_hook(response, *args, **kwargs): # pylint: disable=unused-argument """Log summary information about every request made.""" - logger = get_logger() logger.info( "[{}] [{}] [{}] {}".format( response.request.method, response.status_code, response.elapsed.total_seconds(), response.url From fd922c7ab550d24e00cd422df831ae73913b0225 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Thu, 7 Nov 2024 22:01:25 +0500 Subject: [PATCH 10/30] refactor: refactor writing report to s3 func --- edx_argoutils/__init__.py | 2 +- edx_argoutils/s3.py | 41 ++++++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 497b775..7c42346 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.5' +__version__ = '1.0.6' diff --git a/edx_argoutils/s3.py b/edx_argoutils/s3.py index 36f561a..a0c7e02 100644 --- a/edx_argoutils/s3.py +++ b/edx_argoutils/s3.py @@ -6,7 +6,8 @@ from prefect import task from prefect.tasks.aws import s3 from prefect.utilities.aws import get_boto_client - +import boto3 +import logging @task def delete_s3_directory(bucket: str = None, prefix: str = None, credentials: dict = None): @@ -67,29 +68,33 @@ def list_object_keys_from_s3(bucket: str = None, prefix: str = '', credentials: return [] -def get_s3_path_for_date(date): +def get_s3_path_for_file(filename): # The path and file name inside our given bucket and S3 prefix to write the file to - return '{date}/{date}.json'.format(date=date) + return '{filename}/{filename}.json'.format(filename=filename) -@task def write_report_to_s3(download_results: tuple, s3_bucket: str, s3_path: str, credentials: dict = None): - logger = prefect.context.get("logger") - - date, report_str = download_results - date_path = get_s3_path_for_date(date) - s3_key = s3_path + date_path - logger.info("Writing report to S3 for {} to {}".format(date, s3_key)) - - s3.S3Upload(bucket=s3_bucket).run( - report_str, - key=s3_key, - credentials=credentials + logger = logging.getLogger("s3") + + filename, report_str = download_results + file_path = get_s3_path_for_file(filename) + s3_key = s3_path + file_path + logger.info("Writing report to S3 for {} to {}".format(filename, s3_key)) + + s3_client = boto3.client( + 's3', + aws_access_key_id=credentials.get('AccessKeyId'), + aws_secret_access_key=credentials.get('SecretAccessKey'), + aws_session_token=credentials.get('SessionToken') ) - - return date_path + s3_client.put_object( + Bucket=s3_bucket, + Key=s3_key, + Body=report_str, + ContentType='application/json' + ) + return file_path -@task def get_s3_url(s3_bucket, s3_path): return 's3://{bucket}/{path}'.format(bucket=s3_bucket, path=s3_path) From 3770cf91f510be25e97109916ec7d5e66813a1f0 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Fri, 8 Nov 2024 18:13:10 +0500 Subject: [PATCH 11/30] chore: fixes and remove task decorator from load_s3_data_to_snowflake --- edx_argoutils/__init__.py | 2 +- edx_argoutils/sitemap.py | 2 +- edx_argoutils/snowflake.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 7c42346..e8bf13c 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.6' +__version__ = '1.0.7' diff --git a/edx_argoutils/sitemap.py b/edx_argoutils/sitemap.py index d71591f..e7aca2c 100644 --- a/edx_argoutils/sitemap.py +++ b/edx_argoutils/sitemap.py @@ -47,7 +47,7 @@ def write_sitemap_to_s3(sitemap_data: str, s3_bucket: str, s3_path: str, credent filename, sitemap_json = sitemap_data today = get_date(None) date_path = f'{today}/{filename}.json' - s3_key = f'{s3_path}/{date_path}' + s3_key = f'{s3_path}{date_path}' s3_client = boto3.client( 's3', aws_access_key_id=credentials.get('AccessKeyId'), diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index ed14fac..e6e2877 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -224,7 +224,6 @@ def load_ga_data_to_snowflake( sf_connection.close() -@task def load_s3_data_to_snowflake( date: str, date_property: str, From 0ed9a5f76c91c0484447c9e444d52fbca0a0dc57 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Fri, 8 Nov 2024 21:55:05 +0500 Subject: [PATCH 12/30] refactor: refactored s3 funcs --- edx_argoutils/__init__.py | 2 +- edx_argoutils/s3.py | 54 +++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index e8bf13c..6570598 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.7' +__version__ = '1.0.9' diff --git a/edx_argoutils/s3.py b/edx_argoutils/s3.py index a0c7e02..dd6948c 100644 --- a/edx_argoutils/s3.py +++ b/edx_argoutils/s3.py @@ -1,15 +1,26 @@ """ -S3 related common methods and tasks for Prefect +S3 related common methods """ -import prefect -from prefect import task -from prefect.tasks.aws import s3 -from prefect.utilities.aws import get_boto_client import boto3 import logging -@task +logger = logging.getLogger("s3") + +def get_s3_client(credentials: dict = None): + s3_client = None + if credentials: + s3_client = boto3.client( + 's3', + aws_access_key_id=credentials.get('AccessKeyId'), + aws_secret_access_key=credentials.get('SecretAccessKey'), + aws_session_token=credentials.get('SessionToken') + ) + else: + s3_client = boto3.client('s3') + + return s3_client + def delete_s3_directory(bucket: str = None, prefix: str = None, credentials: dict = None): """ Deletes all objects with the given S3 directory (prefix) from the given bucket. @@ -19,10 +30,9 @@ def delete_s3_directory(bucket: str = None, prefix: str = None, credentials: dic prefix (str): The S3 prefix to delete the objects from. credentials (dict): The AWS credentials to use. """ - s3_keys = list_object_keys_from_s3.run(bucket, prefix, credentials) + s3_keys = list_object_keys_from_s3(bucket, prefix, credentials) if s3_keys: - s3_client = get_boto_client('s3', credentials=credentials) - logger = prefect.context.get("logger") + s3_client = get_s3_client(credentials) logger.info("Deleting S3 keys: {}".format(s3_keys)) s3_client.delete_objects( Bucket=bucket, @@ -32,7 +42,6 @@ def delete_s3_directory(bucket: str = None, prefix: str = None, credentials: dic ) -@task def delete_object_from_s3(key: str = None, bucket: str = None, credentials: dict = None, ): """ Delete an object from S3. @@ -41,11 +50,10 @@ def delete_object_from_s3(key: str = None, bucket: str = None, credentials: dict bucket (str): Name of the S3 bucket to delete from. credentials (dict): AWS credentials, if None boto will fall back the usual methods of resolution. """ - s3_client = get_boto_client("s3", credentials=credentials) + s3_client = get_s3_client(credentials) s3_client.delete_object(Bucket=bucket, Key=key) -@task def list_object_keys_from_s3(bucket: str = None, prefix: str = '', credentials: dict = None, ): """ List objects key names from an S3 bucket that match the given prefix. @@ -54,11 +62,13 @@ def list_object_keys_from_s3(bucket: str = None, prefix: str = '', credentials: bucket (str): Name of the S3 bucket to search from. credentials (dict): AWS credentials, if None boto will fall back the usual methods of resolution. """ - s3_client = get_boto_client("s3", credentials=credentials) + s3_client = get_s3_client(credentials) + + found_objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) - logger = prefect.context.get("logger") - logger.info(found_objects) + logger.info(f"Found objects: {found_objects}") + # edx_legacy/segment-config/dev/load_segment_config_to_snowflake/2024-11-08/2024-11-08.json if found_objects['KeyCount']: return [ @@ -68,25 +78,19 @@ def list_object_keys_from_s3(bucket: str = None, prefix: str = '', credentials: return [] -def get_s3_path_for_file(filename): +def get_s3_path_for_date(filename): # The path and file name inside our given bucket and S3 prefix to write the file to return '{filename}/{filename}.json'.format(filename=filename) def write_report_to_s3(download_results: tuple, s3_bucket: str, s3_path: str, credentials: dict = None): - logger = logging.getLogger("s3") - filename, report_str = download_results - file_path = get_s3_path_for_file(filename) + file_path = get_s3_path_for_date(filename) s3_key = s3_path + file_path logger.info("Writing report to S3 for {} to {}".format(filename, s3_key)) - s3_client = boto3.client( - 's3', - aws_access_key_id=credentials.get('AccessKeyId'), - aws_secret_access_key=credentials.get('SecretAccessKey'), - aws_session_token=credentials.get('SessionToken') - ) + s3_client = get_s3_client(credentials) + s3_client.put_object( Bucket=s3_bucket, Key=s3_key, From 8ea7c41df2916d07fb74d7e069461030ec9a4ae9 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Mon, 11 Nov 2024 16:13:23 +0500 Subject: [PATCH 13/30] chore: update paypal to remove prefect --- edx_argoutils/paypal.py | 50 +++++++++++++---------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/edx_argoutils/paypal.py b/edx_argoutils/paypal.py index 805aa1d..fc01b53 100644 --- a/edx_argoutils/paypal.py +++ b/edx_argoutils/paypal.py @@ -5,14 +5,17 @@ import datetime import fnmatch import json +import logging -import prefect from paramiko import SFTPClient, Transport -from prefect import config, task -from prefect.engine import signals - from edx_argoutils.s3 import get_s3_path_for_date, list_object_keys_from_s3 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', +) +log = logging.getLogger("load_paypal_to_snowflake") + def check_paypal_report(sftp_connection, remote_filename, check_column_name): """ @@ -71,8 +74,7 @@ def get_paypal_filename(date, prefix, connection, remote_path): """ Get remote filename. Sample remote filename: DDR-20190822.01.008.CSV """ - logger = prefect.context.get("logger") - logger.info(connection) + log.info(connection) date_string = date.strftime('%Y%m%d') pattern = (prefix, date_string, 'CSV') remote_filepattern = "*".join(pattern) @@ -87,13 +89,6 @@ class RemoteFileNotFoundError(Exception): pass -# Retry every 10 minutes for 5 hours! This should hopefully handle situations when the report is abnormally late. -@task( - max_retries=30, - retry_delay=datetime.timedelta(minutes=10), - # Skip this retry filter until we upgrade to prefect 1.2.x since it is a new feature. - retry_on=RemoteFileNotFoundError, -) def fetch_paypal_report( date: str, paypal_credentials: dict, @@ -106,38 +101,25 @@ def fetch_paypal_report( port: str = None, remote_path: str = None, aws_credentials: dict = None, - argo: bool = False, ): - paypal_config = getattr(config, 'paypal', None) - - host = getattr(paypal_config, 'host', None) or host - port = getattr(paypal_config, 'port', None) or port - remote_path = getattr(paypal_config, 'remote_path', None) or remote_path - - logger = prefect.context.get("logger") - logger.info("Pulling Paypal report for {}".format(date)) + log.info("Pulling Paypal report for {}".format(date)) if not overwrite: # If we're not overwriting and the file already exists, raise a skip date_path = get_s3_path_for_date(date) s3_key = s3_path + date_path - logger.info("Checking for existence of: {}".format(s3_key)) + log.info("Checking for existence of: {}".format(s3_key)) - existing_file = list_object_keys_from_s3.run(s3_bucket, s3_key, aws_credentials) + existing_file = list_object_keys_from_s3(s3_bucket, s3_key, aws_credentials) if existing_file: - if not argo: - raise signals.SKIP( - 'File {} already exists and we are not overwriting. Skipping.'.format(s3_key) - ) - else: - logger.info( - 'File {} already exists and we are not overwriting. Skipping.'.format(s3_key) - ) - return + log.info( + 'File {} already exists and we are not overwriting. Skipping.'.format(s3_key) + ) + return else: - logger.info("File not found, continuing download for {}.".format(date)) + log.info("File not found, continuing download for {}.".format(date)) transport = Transport(host, port) transport.connect( From bde093b049e69958ce46f50d48106c268b431b10 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Mon, 11 Nov 2024 16:13:58 +0500 Subject: [PATCH 14/30] chore: bump version 1.0.10 --- edx_argoutils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 6570598..82f8de3 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.9' +__version__ = '1.0.10' From 177b50ce2abb3ff7246ea91d7521f026e5b4943b Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Mon, 11 Nov 2024 21:39:44 +0500 Subject: [PATCH 15/30] chore: update load_s3_data_to_snowflake to remove old approach --- HISTORY.rst | 2 +- edx_argoutils/__init__.py | 2 +- edx_argoutils/snowflake.py | 14 ++++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index aa2ba1f..56b29fd 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,7 @@ History ======= -0.1.0 (2020-05-11) +0.1.0 (2024-11-07) ------------------ * First release on PyPI. diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 82f8de3..4821ff0 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.10' +__version__ = '1.0.11' diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index e6e2877..671a230 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -1,9 +1,10 @@ """ -Utility methods and tasks for working with Snowflake from a Prefect flow. +Utility methods and tasks for working with Snowflake from a Argo flow. """ import json import os from collections import namedtuple +import logging from typing import List, TypedDict from urllib.parse import urlparse @@ -20,6 +21,10 @@ MANIFEST_FILE_NAME = 'manifest.json' EXPORT_MAX_FILESIZE = 104857600 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', +) class SFCredentials(TypedDict, total=False): @@ -281,9 +286,9 @@ def load_s3_data_to_snowflake( disable_existence_check (bool, optional): Whether to disable check for existing data, useful when always appending to the table regardless of any existing data for that provided `date` """ - logger = get_logger() + logger = logging.getLogger("Snowflake Utility") if not file and not pattern: - raise signals.FAIL('Either `file` or `pattern` must be specified to run this task.') + raise ValueError("Either `file` or `pattern` must be specified to run this task.") sf_connection = create_snowflake_connection(sf_credentials, sf_role, warehouse=sf_warehouse) @@ -319,7 +324,8 @@ def load_s3_data_to_snowflake( raise if row and not overwrite: - raise signals.SKIP('Skipping task as data for the date exists and no overwrite was provided.') + logger.info("Skipping task as data for the date exists and no overwrite was provided.") + return else: logger.info("Continuing with S3 load for {}".format(date)) From 2126aefb9d29d71e032e3cc0cd1114f074526c84 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Tue, 19 Nov 2024 13:03:03 +0500 Subject: [PATCH 16/30] chore: Remove prefect from export_snowflake_table_to_s3 --- edx_argoutils/snowflake.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index 671a230..3b7f3c9 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -13,9 +13,6 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization from prefect import task -from prefect.engine import signals -from prefect.tasks.aws import s3 -from prefect.utilities.logging import get_logger from edx_argoutils import s3 as s3_utils @@ -420,7 +417,6 @@ def load_s3_data_to_snowflake( sf_connection.close() -@task def export_snowflake_table_to_s3( sf_credentials: SFCredentials, sf_database: str, @@ -470,7 +466,7 @@ def export_snowflake_table_to_s3( copy option. generate_manifest (bool, optional): Whether to generate a manifest file in S3. Defaults to `FALSE`. """ - logger = get_logger() + logger = logging.getLogger("Export Snowflake Table to S3 Utility") sf_connection = create_snowflake_connection( credentials=sf_credentials, @@ -489,7 +485,7 @@ def export_snowflake_table_to_s3( if overwrite: logger.info("Deleting existing data in S3 bucket: {bucket} with prefix: {prefix}".format( bucket=export_bucket, prefix=export_prefix)) - s3_utils.delete_s3_directory.run(export_bucket, export_prefix) + s3_utils.delete_s3_directory(export_bucket, export_prefix) escape_clause = '' if escape_unenclosed_field is None \ else "ESCAPE_UNENCLOSED_FIELD = NONE" if escape_unenclosed_field == 'NONE' \ @@ -557,14 +553,16 @@ def export_snowflake_table_to_s3( "urls": s3_file_paths, } ) - s3.S3Upload(bucket=export_bucket).run( - json.dumps(manifest_file_content), - key=s3_manifest_file_prefix + s3_client = s3_utils.get_s3_client() + s3_client.put_object( + Bucket=export_bucket, + Key=s3_manifest_file_prefix, + Body=json.dumps(manifest_file_content) ) except snowflake.connector.ProgrammingError as e: if 'Files already existing at the unload destination' in e.msg: logger.error("Files already exist at {destination}".format(destination=export_path)) - raise signals.FAIL('Files already exist. Use overwrite option to force unloading.') + raise Exception('Files already exist. Use overwrite option to force unloading.') else: raise finally: From 23e60e63f2a050962a00844e1542a6985e7d4d8e Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Tue, 19 Nov 2024 13:06:57 +0500 Subject: [PATCH 17/30] chore: bump version --- edx_argoutils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 4821ff0..1c7ca71 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.11' +__version__ = '1.0.12' From 84e15664361f40eff1f68f87687a5d8e40965349 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Tue, 19 Nov 2024 13:28:00 +0500 Subject: [PATCH 18/30] chore: update mysql functions --- edx_argoutils/__init__.py | 2 +- edx_argoutils/mysql.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 1c7ca71..a8afaca 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.12' +__version__ = '1.0.13' diff --git a/edx_argoutils/mysql.py b/edx_argoutils/mysql.py index 7e23561..97976da 100644 --- a/edx_argoutils/mysql.py +++ b/edx_argoutils/mysql.py @@ -1,15 +1,16 @@ """ Tasks for interacting with Aurora MySQL. """ +import logging import os - import mysql.connector -from prefect import task -from prefect.engine import signals -from prefect.utilities.logging import get_logger from edx_argoutils.snowflake import MANIFEST_FILE_NAME +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', +) def create_mysql_connection(credentials: dict, database: str, autocommit: bool = False): @@ -44,7 +45,6 @@ def create_mysql_connection(credentials: dict, database: str, autocommit: bool = return connection -@task def load_s3_data_to_mysql( aurora_credentials: dict, database: str, @@ -98,7 +98,7 @@ def _drop_temp_tables(table, connection): query = "DROP TABLE IF EXISTS {table}".format(table=table) connection.cursor().execute(query) - logger = get_logger() + logger = logging.getLogger("load_s3_data_to_mysql") connection = create_mysql_connection(aurora_credentials, database) @@ -120,7 +120,7 @@ def _drop_temp_tables(table, connection): table=table, table_schema=table_schema ) - logger.debug(query) + logger.info(query) connection.cursor().execute(query) # Check for existing data @@ -130,7 +130,8 @@ def _drop_temp_tables(table, connection): row = cursor.fetchone() if row and not overwrite: - raise signals.SKIP('Skipping task as data already exists in the dest. table and no overwrite was provided.') + logger.info('Skipping task as data already exists in the destination table and no overwrite was provided.') + return # Create a temp table for loading data if overwrite and overwrite_with_temp_table: @@ -141,7 +142,7 @@ def _drop_temp_tables(table, connection): try: if row and overwrite and not overwrite_with_temp_table: query = "DELETE FROM {table} {record_filter}".format(table=table, record_filter=record_filter) - logger.debug("Deleting existing data for {table}".format(table=table)) + logger.info("Deleting existing data for {table}".format(table=table)) connection.cursor().execute(query) if use_manifest: @@ -177,7 +178,7 @@ def _drop_temp_tables(table, connection): # Commit if we're not getting an implicit commit from RENAME. connection.commit() except Exception as e: - logger.error(str(e)) + logger.info(str(e)) connection.rollback() raise finally: From ad5a22b9a4bdc2d9bcde797bc9f9c8ece9f774fa Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Mon, 25 Nov 2024 17:27:48 +0500 Subject: [PATCH 19/30] fix: fixed mysql username cred --- edx_argoutils/__init__.py | 2 +- edx_argoutils/mysql.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index a8afaca..ed72755 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.13' +__version__ = '1.0.14' diff --git a/edx_argoutils/mysql.py b/edx_argoutils/mysql.py index 97976da..dc4e513 100644 --- a/edx_argoutils/mysql.py +++ b/edx_argoutils/mysql.py @@ -14,7 +14,7 @@ def create_mysql_connection(credentials: dict, database: str, autocommit: bool = False): - user = credentials['user'] + user = credentials['username'] password = credentials['password'] host = credentials['host'] From f652cde60c219b04f177aafb673f839f533a6c4a Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Fri, 29 Nov 2024 15:31:00 +0500 Subject: [PATCH 20/30] chore: update export_snowflake_table_to_s3 to handle aws_credentials --- edx_argoutils/snowflake.py | 5 +++-- tests/test_common.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index 3b7f3c9..3e564da 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -434,6 +434,7 @@ def export_snowflake_table_to_s3( overwrite: bool = True, single: bool = False, generate_manifest: bool = False, + aws_credentials: dict = None, ): """ @@ -485,7 +486,7 @@ def export_snowflake_table_to_s3( if overwrite: logger.info("Deleting existing data in S3 bucket: {bucket} with prefix: {prefix}".format( bucket=export_bucket, prefix=export_prefix)) - s3_utils.delete_s3_directory(export_bucket, export_prefix) + s3_utils.delete_s3_directory(export_bucket, export_prefix, aws_credentials) escape_clause = '' if escape_unenclosed_field is None \ else "ESCAPE_UNENCLOSED_FIELD = NONE" if escape_unenclosed_field == 'NONE' \ @@ -553,7 +554,7 @@ def export_snowflake_table_to_s3( "urls": s3_file_paths, } ) - s3_client = s3_utils.get_s3_client() + s3_client = s3_utils.get_s3_client(aws_credentials) s3_client.put_object( Bucket=export_bucket, Key=s3_manifest_file_prefix, diff --git a/tests/test_common.py b/tests/test_common.py index 0e89c45..497a019 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -4,7 +4,7 @@ Tests for Common utils in the `edx_argoutils`. """ -from edx_prefectutils import common +from edx_argoutils import common def test_generate_dates(): From bf2c5fa92d0c461cb8ba2848b15897e020183e83 Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Fri, 29 Nov 2024 15:31:24 +0500 Subject: [PATCH 21/30] chore: bump version --- edx_argoutils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index ed72755..fd8b9e1 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.14' +__version__ = '1.0.15' From 7b1013d45af61c5df91e967a8212faeb97e04966 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Fri, 6 Dec 2024 23:40:38 +0500 Subject: [PATCH 22/30] chore: added functionality for multiple files upload to snowflake --- edx_argoutils/__init__.py | 2 +- edx_argoutils/snowflake.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index fd8b9e1..6c0558c 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.15' +__version__ = '1.0.16' diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index 3e564da..e6be30a 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -376,7 +376,9 @@ def load_s3_data_to_snowflake( if file: logger.info("Loading file {}".format(file)) - files_paramater = "FILES = ( '{}' )".format(file) + files = file.split(',') + # files_paramater = "FILES = ( '{}' )".format(file) + files_parameter = "FILES = ({})".format(", ".join(["'{}'".format(f) for f in files])) if pattern: logger.info("Loading pattern {}".format(pattern)) From 53be214d9dbebbcfe130dcf9cbf3861cc3afb106 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Sat, 7 Dec 2024 01:11:59 +0500 Subject: [PATCH 23/30] fix: fix variable name in snowflake.py --- edx_argoutils/__init__.py | 2 +- edx_argoutils/snowflake.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 6c0558c..eee0975 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.16' +__version__ = '1.0.17' diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index e6be30a..3cc8534 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -378,7 +378,7 @@ def load_s3_data_to_snowflake( logger.info("Loading file {}".format(file)) files = file.split(',') # files_paramater = "FILES = ( '{}' )".format(file) - files_parameter = "FILES = ({})".format(", ".join(["'{}'".format(f) for f in files])) + files_paramater = "FILES = ({})".format(", ".join(["'{}'".format(f) for f in files])) if pattern: logger.info("Loading pattern {}".format(pattern)) From 0329e4d9305d04a0a8b863faca141996e06dcaf4 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Fri, 24 Jan 2025 18:32:25 +0500 Subject: [PATCH 24/30] chore: return all objects in list_object_keys_from_s3 function --- edx_argoutils/s3.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/edx_argoutils/s3.py b/edx_argoutils/s3.py index dd6948c..018f5f2 100644 --- a/edx_argoutils/s3.py +++ b/edx_argoutils/s3.py @@ -63,19 +63,32 @@ def list_object_keys_from_s3(bucket: str = None, prefix: str = '', credentials: credentials (dict): AWS credentials, if None boto will fall back the usual methods of resolution. """ s3_client = get_s3_client(credentials) - - found_objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) + all_object_keys = [] - logger.info(f"Found objects: {found_objects}") + response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) + + if 'Contents' in response: + all_object_keys.extend([o['Key'] for o in response['Contents']]) + + while response.get('IsTruncated'): # Check if there are more objects to fetch + # Use the NextContinuationToken to get the next batch of results + response = s3_client.list_objects_v2( + Bucket=bucket, + Prefix=prefix, + ContinuationToken=response['NextContinuationToken'] + ) + + # Add the new batch of results to the list + if 'Contents' in response: + all_object_keys.extend([o['Key'] for o in response['Contents']]) + + # Log the total number of found objects + logger.info(f"Total objects found: {len(all_object_keys)}") + logger.info(f"Found objects: {all_object_keys}") # edx_legacy/segment-config/dev/load_segment_config_to_snowflake/2024-11-08/2024-11-08.json - if found_objects['KeyCount']: - return [ - o['Key'] for o in found_objects['Contents'] - ] - else: - return [] + return all_object_keys def get_s3_path_for_date(filename): From 3f644fe2b7dec844339a30c4f68213cdb1ae637a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Date: Fri, 24 Jan 2025 18:35:24 +0500 Subject: [PATCH 25/30] chore: incremented version --- edx_argoutils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index eee0975..55b3648 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.17' +__version__ = '1.0.18' From f524d5209d37dd139eaaf2588bbb38364b67360d Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Mon, 27 Jan 2025 18:38:14 +0500 Subject: [PATCH 26/30] chore: remove task from GA function --- edx_argoutils/__init__.py | 2 +- edx_argoutils/snowflake.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/edx_argoutils/__init__.py b/edx_argoutils/__init__.py index 55b3648..92dc86a 100644 --- a/edx_argoutils/__init__.py +++ b/edx_argoutils/__init__.py @@ -2,4 +2,4 @@ Top-level package for edx-argoutils. """ -__version__ = '1.0.18' +__version__ = '1.0.19' diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index 3cc8534..b51b012 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -99,10 +99,6 @@ def qualified_stage_name(database, schema, table) -> str: ) -@task -@backoff.on_exception(backoff.expo, - snowflake.connector.ProgrammingError, - max_tries=3) def load_ga_data_to_snowflake( sf_credentials: SFCredentials, sf_database: str, From 377e22911daf53a445e6aeb4d2e23fa7bf8048ed Mon Sep 17 00:00:00 2001 From: Abdul Rafey Date: Mon, 27 Jan 2025 18:39:41 +0500 Subject: [PATCH 27/30] chore: remove unused import --- edx_argoutils/snowflake.py | 1 - 1 file changed, 1 deletion(-) diff --git a/edx_argoutils/snowflake.py b/edx_argoutils/snowflake.py index b51b012..d7ae6d0 100644 --- a/edx_argoutils/snowflake.py +++ b/edx_argoutils/snowflake.py @@ -12,7 +12,6 @@ import snowflake.connector from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization -from prefect import task from edx_argoutils import s3 as s3_utils From bf9f7c878462d5857c34ca4d210bf9bea51570e7 Mon Sep 17 00:00:00 2001 From: kmaepa Date: Thu, 30 Jan 2025 09:38:32 +0200 Subject: [PATCH 28/30] chore: add test_generate_date_range unit test --- tests/test_common.py | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_common.py b/tests/test_common.py index 497a019..71515b3 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -5,6 +5,8 @@ """ from edx_argoutils import common +from datetime import datetime, date +from unittest.mock import patch def test_generate_dates(): @@ -31,3 +33,47 @@ def test_get_unzipped_cartesian_product(): (1, 1, 1, 2, 2, 2, 3, 3, 3), ("a", "b", "c", "a", "b", "c", "a", "b", "c") ] + + +def test_generate_date_range(): + # Test Case 1: Custom date range (is_daily=False) + result = common.generate_date_range( + start_date='2025-01-01', + end_date='2025-01-05', + is_daily=False + ) + assert result == [ + datetime.strptime(date, '%Y-%m-%d').date() + for date in ['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04', '2025-01-05'] + ] + + # Test Case 2: Daily run (is_daily=True) with mock + fixed_today = date(2025, 1, 28) # Assume today's date is 2025-01-28 + with patch('edx_argoutils.common.date') as mock_date: + mock_date.today.return_value = fixed_today + mock_date.side_effect = lambda *args, **kwargs: date(*args, **kwargs) + + result = common.generate_date_range(is_daily=True) + expected = [date(2025, 1, 26)] # Two days before fixed_today + assert result == expected, f"Expected {expected}, but got {result}" + + #Test Case 3: True-up scenario (is_daily=False, no start_date and end_date) + with patch('edx_argoutils.common.date') as mock_date: + mock_date.today.return_value = fixed_today + mock_date.side_effect = lambda *args, **kwargs: date(*args, **kwargs) + + result = common.generate_date_range(is_daily=False) + expected = [date(2024, 12, d) for d in range(1, 32)] # Last completed month + assert result == expected, f"Expected {expected}, but got {result}" + + # Test Case 4: Invalid parameters + try: + common.generate_date_range( + start_date="2025-01-01", + end_date=None, + is_daily=False + ) + except Exception as e: + assert str(e) == "Incorrect parameters passed!" + else: + assert False, "Expected an exception but none was raised!" \ No newline at end of file From cee832fe842054b7be046e0068d0a6551bd13522 Mon Sep 17 00:00:00 2001 From: kmaepa Date: Thu, 30 Jan 2025 09:52:03 +0200 Subject: [PATCH 29/30] chore: add unit test test_generate_date_range --- tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_common.py b/tests/test_common.py index 71515b3..f85eb3f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -63,7 +63,7 @@ def test_generate_date_range(): mock_date.side_effect = lambda *args, **kwargs: date(*args, **kwargs) result = common.generate_date_range(is_daily=False) - expected = [date(2024, 12, d) for d in range(1, 32)] # Last completed month + expected = [date(2024, 12, d) for d in range(1, 32)] #Last completed month assert result == expected, f"Expected {expected}, but got {result}" # Test Case 4: Invalid parameters From 449b461b30f9bb2e34a462bf613ed4783973315b Mon Sep 17 00:00:00 2001 From: kmaepa Date: Thu, 30 Jan 2025 13:54:54 +0200 Subject: [PATCH 30/30] chore: add unit test for get_filename_safe_course_id --- tests/test_common.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_common.py b/tests/test_common.py index f85eb3f..63ed115 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -7,7 +7,7 @@ from edx_argoutils import common from datetime import datetime, date from unittest.mock import patch - +from opaque_keys.edx.keys import CourseKey def test_generate_dates(): result = common.generate_dates( @@ -35,6 +35,18 @@ def test_get_unzipped_cartesian_product(): ] +def test_valid_course_id(): + course_id = "course-v1:BerkeleyX+CS198.SDC.1+1T2021" + result = "BerkeleyX_CS198.SDC.1_1T2021" + with patch.object(CourseKey, 'from_string') as mock_from_string: + mock_from_string.return_value._to_string.return_value = result + assert result == common.get_filename_safe_course_id(course_id) + +def test_invalid_course_id(): + course_id = "BerkeleyX!CS198.SDC.1!1T2021" + result = "BerkeleyX_CS198.SDC.1_1T2021" + assert result == common.get_filename_safe_course_id(course_id) + def test_generate_date_range(): # Test Case 1: Custom date range (is_daily=False) result = common.generate_date_range(