diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml index a59ff5a7..226210e3 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -31,7 +31,7 @@ jobs: uses: actions/checkout@v4 - name: Log in to the Container registry - uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 + uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -45,7 +45,7 @@ jobs: - name: Build Docker image (PCW) if: ${{ matrix.suffix == 'main' }} - uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 + uses: docker/build-push-action@32945a339266b759abcbdc89316275140b0fc960 with: context: . file: containers/Dockerfile @@ -54,7 +54,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} - name: Build Docker image (K8S) if: ${{ matrix.suffix == 'k8s' }} - uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 + uses: docker/build-push-action@32945a339266b759abcbdc89316275140b0fc960 with: context: . file: containers/Dockerfile_${{ matrix.suffix }} @@ -77,7 +77,7 @@ jobs: uses: actions/checkout@v4 - name: Log in to the Container registry - uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 + uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -91,7 +91,7 @@ jobs: - name: Build and push Docker image (PCW) if: ${{ matrix.suffix == 'main' }} - uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 + uses: docker/build-push-action@32945a339266b759abcbdc89316275140b0fc960 with: context: . 
file: containers/Dockerfile @@ -100,7 +100,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} - name: Build and push Docker image (K8S) if: ${{ matrix.suffix == 'k8s' }} - uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 + uses: docker/build-push-action@32945a339266b759abcbdc89316275140b0fc960 with: context: . file: containers/Dockerfile_${{ matrix.suffix }} diff --git a/README.md b/README.md index 7db6d2b8..d1c4c5eb 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ > **Anton Smorodskyi**: YES, constantly! :partygeeko: I see it in every palm on the beach ! PublicCloud-Watcher (PCW) is a web app which monitors, displays and deletes resources on various Cloud Service Providers (CSPs). -PCW has two main flows : +PCW has three main flows : 1. **Update run ( implemented in [ocw/lib/db.py](ocw/lib/db.py) )** Executed every 45 minutes. Concentrates on deleting VMs (in case of Azure Resource Groups). - Each update scans accounts defined in configuration file and writes the obtained results into a local sqlite database. Newly discovered entities get assigned an obligatory time-to-life value (TTL). TTL may be taken from tag `openqa_ttl` if entity is tagged with such tag if not PCW will check `pcw.ini` for `updaterun/default_ttl` setting and if setting is not defined than PCW will use hard-coded value from [webui/settings.py](webui/settings.py). Database has a web UI where you can manually trigger certain entity deletion. @@ -19,45 +19,47 @@ PCW has two main flows : 2. **Cleanup ( implemented in [ocw/lib/cleanup.py](ocw/lib/cleanup.py) )** Execution via django command. Concentrates on everything except VM deletion. This vary a lot per CSP so let's clarify that on per provider level. - For Azure such entities monitored (check details in [ocw/lib/azure.py](ocw/lib/azure.py)): a. bootdiagnostics - b. Blobs in `sle-images` container - c. Disks assigned to certain resource groups - d. Images assigned to certain resource groups + b. 
Blobs in all containers + c. Disks assigned to certain resource group defined in pcw.ini ('azure-storage-resourcegroup') + d. Images assigned to certain resource group defined in pcw.ini ('azure-storage-resourcegroup') + e. Image versions assigned to certain resource group defined in pcw.ini ('azure-storage-resourcegroup') - For EC2 such entities monitored (check details in [ocw/lib/ec2.py](ocw/lib/ec2.py)): a. Images in all regions defined b. Snapshots in all region defined c. Volumes in all regions defined d. VPC's ( deletion of VPC means deletion of all assigned to VPC entities first ( security groups , networks etc. )) - For GCE deleting disks, images & network resources (check details in [ocw/lib/gce.py](ocw/lib/gce.py)) - - For Openstack deleting instances, images & keypairs (check details in [ocw/lib/openstack.py](ocw/lib/openstack.py) +3. **Dump entities quantity ( implemented in [ocw/lib/dump_state.py](ocw/lib/dump_state.py) )**. To be able to react fast on possible bugs in PCW and/or unexpected creation of many resources there is an ability to dump real-time data from each CSP into a defined InfluxDB instance. This allows building real-time dashboards and/or setting up a notification flow. The fastest way to run PCW is via the provided containers, as described in the [Running a container](#running-a-container) section. -## Install +# Usage + +## Python virtualenv + +### Requirements files PCW has 3 sets of virtual env requirements files : - [requirements.txt](requirements.txt) common usage for everything except K8S related cleanups - [requirements_k8s.txt](requirements_k8s.txt) due to high volume of dependencies needed only in single use case (k8s cleanups) they excluded in independent category - [requirements_test.txt](requirements_test.txt) contains dependencies allowing to run pcw's unit tests -It's recommended to setup `pcw` in a virtual environment to avoid package collisions: - -```bash -virtualenv venv -. 
venv/bin/activate -pip install -r requirements.txt -``` - -## Configure and run +### Configuration Configuration of PCW happens via a global config file in `/etc/pcw.ini`. See [templates/pcw.ini](templates/pcw.ini) for a configuration template. To start, copy the template over: +```bash cp templates/pwc.ini /etc/pcw.ini +``` +### CSP credentials To be able to connect to CSP PCW needs Service Principal details. Depending on namespaces defined in `pcw.ini` PCW will expect some JSON files to be created -under `/var/pcw/[namespace name]/[Azure/EC2/GCE/Openstack].json`. See [templates/var/example_namespace/](templates/var/example_namespace/) for examples. +under `/var/pcw/[namespace name]/[Azure/EC2/GCE].json`. See [templates/var/example_namespace/](templates/var/example_namespace/) for examples. PCW supports email notifications about left-over instances. See the `notify` section therein and their corresponding comments. +### Build and run + ```bash # Setup virtual environment virtualenv env @@ -79,7 +81,9 @@ python manage.py runserver By default, PCW runs on http://127.0.0.1:8000/ -## Building PCW containers +## PCW in container + +### Available containers In [containers](containers/) folder you main find several Dockerfiles to build several different images: @@ -87,12 +91,12 @@ In [containers](containers/) folder you main find several Dockerfiles to build s - [Dockerfile_k8s](containers/Dockerfile_k8s) image based on [bci-python3.11](https://registry.suse.com/categories/bci-devel/repositories/bci-python311) and can be used to run k8s cleanup - [Dockerfile_k8s_dev](containers/Dockerfile_k8s_dev) and [Dockerfile_dev](containers/Dockerfile_dev) images which contains same set of dependencies as [Dockerfile](containers/Dockerfile) and [Dockerfile_k8s](containers/Dockerfile_k8s) and expect PCW source code to be mounted as volumes. 
Very usefull for development experiments -## Running a container +### Execution You can use the already build containers within [this repository](https://github.com/orgs/SUSE/packages?repo_name=pcw): ```bash -podman pull ghcr.io/suse/pcw:latest +podman pull ghcr.io/suse/pcw_main:latest podman pull ghcr.io/suse/pcw_k8s:latest ``` @@ -104,7 +108,7 @@ The PCW container supports two volumes to be mounted: To create a container using e.g. the data directory `/srv/pcw` for both volumes and expose port 8000, run the following: ```bash -podman create --hostname pcw --name pcw -v /srv/pcw/pcw.ini:/etc/pcw.ini -v /srv/pcw/db:/pcw/db -v :/var/pcw -p 8000:8000/tcp ghcr.io/suse/pcw:latest +podman create --hostname pcw --name pcw -v /srv/pcw/pcw.ini:/etc/pcw.ini -v /srv/pcw/db:/pcw/db -v :/var/pcw -p 8000:8000/tcp ghcr.io/suse/pcw_main:latest podman start pcw ``` @@ -113,7 +117,7 @@ The `pcw` container runs by default the [/pcw/container-startup](containers/cont ```bash podman exec pcw /pcw/container-startup help -podman run -ti --rm --hostname pcw --name pcw -v /srv/pcw/pcw.ini:/etc/pcw.ini -v :/var/pcw -v /srv/pcw/db:/pcw/db -p 8000:8000/tcp ghcr.io/suse/pcw:latest /pcw/container-startup help +podman run -ti --rm --hostname pcw --name pcw -v /srv/pcw/pcw.ini:/etc/pcw.ini -v :/var/pcw -v /srv/pcw/db:/pcw/db -p 8000:8000/tcp ghcr.io/suse/pcw_main:latest /pcw/container-startup help ``` To create an user within the created container named `pcw`, run @@ -122,33 +126,36 @@ To create an user within the created container named `pcw`, run podman exec pcw /pcw/container-startup createuser admin USE_A_STRONG_PASSWORD ``` -## Devel version of container +### Devel version There is [devel version](containers/Dockerfile_dev) of container file. Main difference is that source files are not copied into image but expected to be mounted via volume. This ease development in environment close as much as possible to production run. 
Expected use would be : ```bash -make podman-container-devel +make container-devel podman run -v :/etc/pcw.ini -v :/var/pcw -v :/pcw -t pcw-devel "python3 manage.py " ``` +## Test and debug -## Codecov - -Running codecov locally require installation of `pytest pytest-cov codecov`. -Then you can run it with +### Testing ```bash -BROWSER=$(xdg-settings get default-web-browser) -pytest -v --cov=./ --cov-report=html && $BROWSER htmlcov/index.html +virtualenv . +source bin/activate +pip install -r requirements_test.txt +make test ``` -and explore the results in your browser +The tests contain a Selenium test for the webUI that uses Podman. Make sure that you have the latest [geckodriver](https://github.com/mozilla/geckodriver/releases) installed anywhere in your `PATH` and that the `podman.socket` is enabled: +`systemctl --user enable --now podman.socket` + +Set the `SKIP_SELENIUM` environment variable when running `pytest` or `make test` to skip the Selenium test. -## Debug +### Debug -To simplify problem investigation pcw has two [django commands](https://docs.djangoproject.com/en/3.1/howto/custom-management-commands/) : +To simplify problem investigation pcw has several [django commands](https://docs.djangoproject.com/en/3.1/howto/custom-management-commands/) : [cleanup](ocw/management/commands/cleanup.py) @@ -160,17 +167,3 @@ To simplify problem investigation pcw has two [django commands](https://docs.dja those allows triggering core functionality without web UI. It is highly recommended to use `dry_run = True` in `pcw.ini` in such cases. - -## Testing - -```bash -virtualenv . -source bin/activate -pip install -r requirements_test.txt -make test -``` - -The tests contain a Selenium test for the webUI that uses Podman. 
Make sure that you have the latest [geckodriver](https://github.com/mozilla/geckodriver/releases) installed anywhere in your `PATH` and that the `podman.socket` is enabled: -`systemctl --user enable --now podman.socket` - -Set the `SKIP_SELENIUM` environment variable when running `pytest` or `make test` to skip the Selenium test. diff --git a/containers/Dockerfile_k8s b/containers/Dockerfile_k8s index 93694f6d..c9f91ca9 100644 --- a/containers/Dockerfile_k8s +++ b/containers/Dockerfile_k8s @@ -1,6 +1,6 @@ FROM registry.suse.com/bci/python:3.11 -RUN zypper -n in gcc tar gzip kubernetes1.24-client aws-cli && zypper clean && rm -rf /var/cache +RUN zypper -n in gcc tar gzip kubernetes1.28-client aws-cli && zypper clean && rm -rf /var/cache # Google cli installation RUN curl -sf https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-415.0.0-linux-x86_64.tar.gz | tar -zxf - -C /opt \ diff --git a/containers/Dockerfile_k8s_dev b/containers/Dockerfile_k8s_dev index d5aaa542..e285e3c1 100644 --- a/containers/Dockerfile_k8s_dev +++ b/containers/Dockerfile_k8s_dev @@ -1,6 +1,6 @@ FROM registry.suse.com/bci/python:3.11 -RUN zypper -n in gcc tar gzip kubernetes1.24-client aws-cli && zypper clean && rm -rf /var/cache +RUN zypper -n in gcc tar gzip kubernetes1.28-client aws-cli && zypper clean && rm -rf /var/cache # Google cli installation RUN curl -sf https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-415.0.0-linux-x86_64.tar.gz | tar -zxf - -C /opt \ diff --git a/ocw/enums.py b/ocw/enums.py index 6523ba57..370e8e97 100644 --- a/ocw/enums.py +++ b/ocw/enums.py @@ -20,7 +20,6 @@ class ProviderChoice(ChoiceEnum): GCE = 'Google' EC2 = 'EC2' AZURE = 'Azure' - OSTACK = 'Openstack' @staticmethod def from_str(provider): @@ -30,8 +29,6 @@ def from_str(provider): return ProviderChoice.EC2 if provider.upper() == ProviderChoice.AZURE: return ProviderChoice.AZURE - if provider.upper() == ProviderChoice.OSTACK: - return ProviderChoice.OSTACK 
raise ValueError(f"{provider} is not convertable to ProviderChoice") diff --git a/ocw/lib/cleanup.py b/ocw/lib/cleanup.py index 75cdd22e..22f204cc 100644 --- a/ocw/lib/cleanup.py +++ b/ocw/lib/cleanup.py @@ -4,7 +4,6 @@ from ocw.lib.azure import Azure from ocw.lib.ec2 import EC2 from ocw.lib.gce import GCE -from ocw.lib.openstack import Openstack from ocw.lib.eks import EKS from ocw.lib.emailnotify import send_mail, send_cluster_notification from ocw.enums import ProviderChoice @@ -26,9 +25,6 @@ def cleanup_run(): if ProviderChoice.GCE in providers: GCE(namespace).cleanup_all() - if ProviderChoice.OSTACK in providers: - Openstack(namespace).cleanup_all() - except Exception as ex: logger.exception("[%s] Cleanup failed!", namespace) send_mail(f'{type(ex).__name__} on Cleanup in [{namespace}]', traceback.format_exc()) diff --git a/ocw/lib/db.py b/ocw/lib/db.py index 3358eac4..8931b461 100644 --- a/ocw/lib/db.py +++ b/ocw/lib/db.py @@ -8,7 +8,7 @@ from ocw.apps import getScheduler from webui.PCWConfig import PCWConfig from ..models import Instance, StateChoice, ProviderChoice, CspInfo -from .emailnotify import send_mail, send_leftover_notification +from .emailnotify import send_mail from .azure import Azure from .ec2 import EC2 from .gce import GCE @@ -155,7 +155,6 @@ def update_run() -> None: traceback.format_exc()) auto_delete_instances() - send_leftover_notification() RUNNING = False if not error_occured: LAST_UPDATE = datetime.now(timezone.utc) diff --git a/ocw/lib/dump_state.py b/ocw/lib/dump_state.py index f7256626..0d7bbcb0 100644 --- a/ocw/lib/dump_state.py +++ b/ocw/lib/dump_state.py @@ -4,6 +4,7 @@ from webui.PCWConfig import PCWConfig from ocw.lib.azure import Azure from ocw.lib.ec2 import EC2 +from ocw.lib.gce import GCE from ocw.enums import ProviderChoice from ocw.lib.influx import Influx @@ -65,6 +66,43 @@ def dump_state(): namespace, EC2(namespace).count_all_volumes ) + Influx().dump_resource( + ProviderChoice.EC2.value, + Influx.VPC_QUANTITY, + 
namespace, + EC2(namespace).count_all_vpc + ) + if ProviderChoice.GCE in providers: + Influx().dump_resource( + ProviderChoice.GCE.value, + Influx.VMS_QUANTITY, + namespace, + GCE(namespace).count_all_instances + ) + Influx().dump_resource( + ProviderChoice.GCE.value, + Influx.IMAGES_QUANTITY, + namespace, + GCE(namespace).count_all_images + ) + Influx().dump_resource( + ProviderChoice.GCE.value, + Influx.DISK_QUANTITY, + namespace, + GCE(namespace).count_all_disks + ) + Influx().dump_resource( + ProviderChoice.GCE.value, + Influx.BLOB_QUANTITY, + namespace, + GCE(namespace).count_all_blobs + ) + Influx().dump_resource( + ProviderChoice.GCE.value, + Influx.NETWORK_QUANTITY, + namespace, + GCE(namespace).count_all_networks + ) except Exception: logger.exception( "[%s] Dump state failed!: \n %s", namespace, traceback.format_exc() diff --git a/ocw/lib/ec2.py b/ocw/lib/ec2.py index 92b80340..d4eb33d6 100644 --- a/ocw/lib/ec2.py +++ b/ocw/lib/ec2.py @@ -325,7 +325,12 @@ def vpc_can_be_deleted(self, resource_vpc, vpc_id) -> bool: def report_cleanup_results(self, vpc_errors: list, vpc_notify: list, vpc_locked: list) -> None: if len(vpc_errors) > 0: - send_mail(f'Errors on VPC deletion in [{self._namespace}]', '\n'.join(vpc_errors)) + # this is most common error message which we can not fix. 
+ # So no point to spam us with notifications about it + known_error = "An error occurred (DependencyViolation) when calling the DeleteVpc operation" + filtered = [x for x in vpc_errors if known_error not in x] + if len(filtered) > 0: + send_mail(f'Errors on VPC deletion in [{self._namespace}]', '\n'.join(vpc_errors)) if len(vpc_notify) > 0: send_mail(f'{len(vpc_notify)} VPC\'s should be deleted, skipping due vpc-notify-only=True', ','.join(vpc_notify)) if len(vpc_locked) > 0: @@ -345,6 +350,13 @@ def count_all_volumes(self) -> int: all_volumes_cnt += len(response['Volumes']) return all_volumes_cnt + def count_all_vpc(self) -> int: + all_vpcs = 0 + for region in self.all_regions: + response = self.ec2_client(region).describe_vpcs(Filters=[{'Name': 'isDefault', 'Values': ['false']}]) + all_vpcs += len(response['Vpcs']) + return all_vpcs + def cleanup_images(self, valid_period_days: float) -> None: self.log_dbg('Call cleanup_images') for region in self.all_regions: diff --git a/ocw/lib/emailnotify.py b/ocw/lib/emailnotify.py index d0141613..f2fe6dd4 100644 --- a/ocw/lib/emailnotify.py +++ b/ocw/lib/emailnotify.py @@ -1,4 +1,3 @@ -from datetime import timedelta import smtplib import logging from email.mime.text import MIMEText @@ -6,7 +5,6 @@ from django.urls import reverse from webui.PCWConfig import PCWConfig from webui.settings import build_absolute_uri -from ..models import Instance logger = logging.getLogger(__name__) @@ -31,22 +29,6 @@ def draw_instance_table(objects): return table.draw() -def send_leftover_notification(): - if PCWConfig.has('notify'): - all_instances = Instance.objects - all_instances = all_instances.filter(active=True, age__gt=timedelta(hours=PCWConfig.get_feature_property( - 'notify', 'age-hours'))).exclude(ignore=True) - body_prefix = f"Message from {build_absolute_uri()}\n\n" - # Handle namespaces - for namespace in PCWConfig.get_namespaces_for('notify'): - receiver_email = PCWConfig.get_feature_property('notify', 'to', namespace) - 
namespace_objects = all_instances.filter(namespace=namespace) - if namespace_objects.filter(notified=False).count() > 0 and receiver_email: - send_mail(f'CSP left overs - {namespace}', - body_prefix + draw_instance_table(namespace_objects), receiver_email=receiver_email) - all_instances.update(notified=True) - - def send_cluster_notification(namespace, clusters): if len(clusters) and PCWConfig.has('notify'): clusters_str = '' diff --git a/ocw/lib/gce.py b/ocw/lib/gce.py index 95cc85cb..1cb1bee2 100644 --- a/ocw/lib/gce.py +++ b/ocw/lib/gce.py @@ -187,9 +187,14 @@ def cleanup_images(self) -> None: self.log_dbg(f"{len(images)} images found") for image in images: if self.is_outdated(parse(image["creationTimestamp"]).astimezone(timezone.utc)): - self._delete_resource( - self.compute_client().images, image["name"], project=self.project, image=image["name"] - ) + labels = image.get('labels', []) + pcw_ignore_tag = 'pcw_ignore' in labels + if pcw_ignore_tag: + self.log_dbg(f"Ignoring {image['name']} due to 'pcw_ignore' label set to '1'") + else: + self._delete_resource( + self.compute_client().images, image["name"], project=self.project, image=image["name"] + ) def cleanup_firewalls(self) -> None: self.log_dbg("Firewalls cleanup") @@ -259,3 +264,22 @@ def cleanup_networks(self) -> None: self._delete_resource( self.compute_client().networks, network["name"], project=self.project, network=network["name"] ) + + def count_all_instances(self) -> int: + return len(self.list_all_instances()) + + def count_all_images(self) -> int: + return len(self._paginated(self.compute_client().images, project=self.project)) + + def count_all_disks(self) -> int: + all_disks = 0 + for region in self.list_regions(): + for zone in self.list_zones(region): + all_disks += len(self._paginated(self.compute_client().disks, project=self.project, zone=zone)) + return all_disks + + def count_all_blobs(self) -> int: + return len(self._paginated(self.storage_client().objects, bucket=self.__bucket)) + + def 
count_all_networks(self) -> int: + return len(self._paginated(self.compute_client().networks, project=self.project)) diff --git a/ocw/lib/influx.py b/ocw/lib/influx.py index 9b921c5f..a24e50d9 100644 --- a/ocw/lib/influx.py +++ b/ocw/lib/influx.py @@ -17,8 +17,11 @@ class Influx: VMS_QUANTITY: str = "vms_quantity" IMAGES_QUANTITY: str = "images_quantity" DISK_QUANTITY: str = "disk_quantity" + BLOB_QUANTITY: str = "blob_quantity" VOLUMES_QUANTITY: str = "volumes_quanity" IMAGE_VERSION_QUANTITY: str = "img_version_quantity" + VPC_QUANTITY: str = "vpc_quantity" + NETWORK_QUANTITY: str = "network_quantity" NAMESPACE_TAG: str = "namespace" def __init__(self) -> None: diff --git a/ocw/lib/openstack.py b/ocw/lib/openstack.py deleted file mode 100644 index 1c2cb9e3..00000000 --- a/ocw/lib/openstack.py +++ /dev/null @@ -1,116 +0,0 @@ -from datetime import datetime, timezone -from typing import Dict -from dateutil.parser import parse -import openstack -from openstack.exceptions import OpenStackCloudException -from webui.PCWConfig import PCWConfig -from webui.settings import DEBUG -from .provider import Provider - - -class Openstack(Provider): - __instances: Dict[str, "Openstack"] = {} - - def __init__(self, namespace: str): - super().__init__(namespace) - self.client() - - def __new__(cls, namespace: str): - if namespace not in Openstack.__instances: - Openstack.__instances[namespace] = self = object.__new__(cls) - self.__client = None - return Openstack.__instances[namespace] - - def client(self) -> None: - if self.__client is None: - self.__client = openstack.connect( - debug=bool(DEBUG), - auth_url=self.get_data('auth_url'), - project_name=self.get_data('project_name'), - username=self.get_data('username'), - password=self.get_data('password'), - region_name=self.get_data('region_name'), - user_domain_name=self.get_data('user_domain_name'), - project_id=self.get_data('project_id'), - load_envvars=False, # Avoid reading OS_* environment variables - load_yaml_config=False, 
# Avoid reading clouds.yaml - ) - return self.__client - - def is_outdated(self, timestamp: str, param: str) -> bool: - now = datetime.now(timezone.utc) - max_days = PCWConfig.get_feature_property('cleanup', param, self._namespace) - return (now - parse(timestamp).astimezone(timezone.utc)).days > max_days - - def cleanup_all(self) -> None: - self._cleanup_instances() - self._cleanup_images() - self._cleanup_keypairs() - - def _cleanup_instances(self) -> None: - # Delete VM's & associated floating IP address(es) - try: - servers = [vm for vm in self.client().compute.servers() if vm.name.startswith("openqa-vm-")] - except OpenStackCloudException as exc: - self.log_warn("Got exception while listing instances: {}", exc) - return - self.log_dbg("Found {} servers", len(servers)) - for server in servers: - if self.is_outdated(server.created_at, "openstack-vm-max-age-days"): - if self.dry_run: - self.log_info("Instance termination {} skipped due to dry run mode", server.name) - else: - self.log_info("Deleting instance {}", server.name) - try: - if not self.client().delete_server( - server.name, - wait=False, - timeout=180, - delete_ips=True, # Delete floating IP address - delete_ip_retry=1): - self.log_err("Failed to delete instance {}", server.name) - except OpenStackCloudException as exc: - self.log_warn("Got exception while deleting instance {}: {}", server.name, exc) - - def _cleanup_images(self) -> None: - try: - images = [image for image in self.client().image.images() if "openqa" in image.tags] - except OpenStackCloudException as exc: - self.log_warn("Got exception while listing images: {}", exc) - return - self.log_dbg("Found {} images", len(images)) - for image in images: - if self.is_outdated(image.created_at, "openstack-image-max-age-days"): - if self.dry_run: - self.log_info("Image deletion {} skipped due to dry run mode", image.name) - else: - self.log_info("Deleting image {}", image.name) - try: - if not self.client().delete_image( - image.name, - wait=False, 
- timeout=3600): - self.log_err("Failed to delete image {}", image.name) - except OpenStackCloudException as exc: - self.log_warn("Got exception while deleting image {}: {}", image.name, exc) - - def _cleanup_keypairs(self) -> None: - try: - keypairs = [keypair for keypair in self.client().list_keypairs() if keypair.name.startswith("openqa")] - except OpenStackCloudException as exc: - self.log_warn("Got exception while listing keypairs: {}", exc) - return - self.log_dbg("Found {} keypairs", len(keypairs)) - for keypair in keypairs: - if keypair.created_at is None: - keypair.created_at = self.client().compute.get_keypair(keypair.name).created_at - if self.is_outdated(keypair.created_at, "openstack-key-max-days"): - if self.dry_run: - self.log_info("Keypair deletion {} skipped due to dry run mode", keypair.name) - else: - self.log_info("Deleting keypair {}", keypair.name) - try: - if not self.client().delete_keypair(keypair.name): - self.log_err("Failed to delete keypair {}", keypair.name) - except OpenStackCloudException as exc: - self.log_warn("Got exception while deleting keypair {}: {}", keypair.name, exc) diff --git a/ocw/migrations/0013_remove_instance_notified_alter_instance_provider.py b/ocw/migrations/0013_remove_instance_notified_alter_instance_provider.py new file mode 100644 index 00000000..fe2d4945 --- /dev/null +++ b/ocw/migrations/0013_remove_instance_notified_alter_instance_provider.py @@ -0,0 +1,22 @@ +# Generated by Django 5.0.9 on 2024-09-16 19:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('ocw', '0012_rename_vault_namespace_instance_namespace_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='instance', + name='notified', + ), + migrations.AlterField( + model_name='instance', + name='provider', + field=models.CharField(choices=[('GCE', 'Google'), ('EC2', 'EC2'), ('AZURE', 'Azure')], max_length=8), + ), + ] diff --git a/ocw/models.py b/ocw/models.py index 
06d1c10f..9fb79bec 100644 --- a/ocw/models.py +++ b/ocw/models.py @@ -32,7 +32,6 @@ class Instance(models.Model): instance_id = models.CharField(max_length=200) region = models.CharField(max_length=64, default='') namespace = models.CharField(max_length=64, default='') - notified = models.BooleanField(default=False) ignore = models.BooleanField(default=False) TAG_IGNORE = 'pcw_ignore' diff --git a/ocw/tables.py b/ocw/tables.py index ab664c42..c60d4851 100644 --- a/ocw/tables.py +++ b/ocw/tables.py @@ -33,19 +33,6 @@ def render(self, record): return "" -class MailColumn(tables.BooleanColumn): - @property - def header(self): - return "" - - def render(self, value, record, bound_column): - value = self._get_bool_value(record, value, bound_column) - if value: - return format_html('Email notification was send', - static('img/notified.png')) - return "" - - class TagsColumn(tables.TemplateColumn): def __init__(self, template_name=None, **extra): @@ -58,7 +45,6 @@ def header(self): class InstanceTable(tables.Table): tags = TagsColumn() - notified = MailColumn() type = tables.Column(accessor=A('get_type')) first_seen = tables.DateTimeColumn(format='M d Y') last_seen = tables.DateTimeColumn(format='M d Y') diff --git a/requirements.txt b/requirements.txt index fdb13ee0..9908a24b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,6 @@ oauth2client google-api-python-client==2.131.0 google-cloud-storage==2.16.0 openqa_client -openstacksdk~=3.1.0 python-dateutil apscheduler kubernetes diff --git a/templates/pcw.ini b/templates/pcw.ini index b1e2c0bb..fc5dec3f 100644 --- a/templates/pcw.ini +++ b/templates/pcw.ini @@ -36,12 +36,6 @@ ec2-max-age-days = 1 gce-skip-networks = default,tf-network # Max age of data storage resources ( used in Azure and GCE ) max-age-hours = 1 -# Max age for images in Openstack -openstack-image-max-age-days = 3 -# Max age for VM's in Openstack -openstack-vm-max-age-days = 1 -# Max age for keys in Openstack -openstack-key-max-days = 1 # 
Specify with which namespace, we will do the cleanup. # if not specifed default/namespaces list will be taken instead namespaces = qac, sapha diff --git a/tests/test_db.py b/tests/test_db.py index 47e06573..b6907468 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -179,16 +179,12 @@ def mocked__update_provider(arg1, arg2, arg3): def mocked_auto_delete_instances(): call_stack.append('auto_delete_instances') - def mocked_send_leftover_notification(): - call_stack.append('send_leftover_notification') - monkeypatch.setattr('ocw.lib.db._update_provider', mocked__update_provider) monkeypatch.setattr('ocw.lib.db.auto_delete_instances', mocked_auto_delete_instances) - monkeypatch.setattr('ocw.lib.db.send_leftover_notification', mocked_send_leftover_notification) update_run() - assert call_stack == ['_update_provider', 'auto_delete_instances', 'send_leftover_notification'] + assert call_stack == ['_update_provider', 'auto_delete_instances'] def test_update_run_update_provider_throw_exception(update_run_patch, monkeypatch): @@ -202,20 +198,16 @@ def mocked__update_provider(arg1, arg2, arg3): def mocked_auto_delete_instances(): call_stack.append('auto_delete_instances') - def mocked_send_leftover_notification(): - call_stack.append('send_leftover_notification') - def mocked_send_mail(arg1, arg2): call_stack.append('send_mail') monkeypatch.setattr('ocw.lib.db._update_provider', mocked__update_provider) monkeypatch.setattr('ocw.lib.db.auto_delete_instances', mocked_auto_delete_instances) - monkeypatch.setattr('ocw.lib.db.send_leftover_notification', mocked_send_leftover_notification) monkeypatch.setattr('ocw.lib.db.send_mail', mocked_send_mail) update_run() - assert call_stack == ['_update_provider', 'send_mail', 'auto_delete_instances', 'send_leftover_notification'] + assert call_stack == ['_update_provider', 'send_mail', 'auto_delete_instances'] def test_delete_instances_azure(monkeypatch): diff --git a/tests/test_ec2.py b/tests/test_ec2.py index ff1f9474..501e333e 
100644 --- a/tests/test_ec2.py +++ b/tests/test_ec2.py @@ -532,3 +532,7 @@ def test_count_all_volumes(ec2_patch): 'Tags': [{'Key': 'pcw_ignore', 'Value': '1'}]}, ] } assert ec2_patch.count_all_volumes() == 3 + + +def test_count_all_vpcs(ec2_patch_for_vpc): + assert ec2_patch_for_vpc.count_all_vpc() == 1 diff --git a/tests/test_gce.py b/tests/test_gce.py index baf3f6dd..f33ff829 100644 --- a/tests/test_gce.py +++ b/tests/test_gce.py @@ -253,3 +253,35 @@ def __init__(self, content) -> None: assert GCE.get_error_reason(MockHttpError({'error': {'errors': []}})) == "unknown" assert GCE.get_error_reason(MockHttpError({'error': {'errors': [{}]}})) == "unknown" assert GCE.get_error_reason(MockHttpError({'error': {'errors': [{'reason': 'aaa'}]}})) == "aaa" + + +def test_count_all_instances(gce): + with ( + patch.object(gce, 'list_regions', return_value=['region1']), + patch.object(gce, 'list_zones', return_value=['zone1']), + ): + assert gce.count_all_instances() == 2 + + +def test_count_all_images(gce): + with (patch.object(gce, '_paginated', return_value=[1, 2, 3, 4])): + assert gce.count_all_images() == 4 + + +def test_count_all_disks(gce): + with ( + patch.object(gce, 'list_regions', return_value=['region1']), + patch.object(gce, 'list_zones', return_value=['zone1']), + patch.object(gce, '_paginated', return_value=[1, 2, 3, 4]), + ): + assert gce.count_all_disks() == 4 + + +def test_count_all_blobs(gce): + with (patch.object(gce, '_paginated', return_value=[1, 2, 3, 4])): + assert gce.count_all_blobs() == 4 + + +def test_count_all_networks(gce): + with (patch.object(gce, '_paginated', return_value=[1, 2, 3, 4])): + assert gce.count_all_networks() == 4 diff --git a/tests/test_openstack.py b/tests/test_openstack.py deleted file mode 100644 index 64821709..00000000 --- a/tests/test_openstack.py +++ /dev/null @@ -1,181 +0,0 @@ -from collections import namedtuple -from unittest.mock import MagicMock, patch -from datetime import datetime, timezone, timedelta -from pytest 
import fixture -from ocw.lib.openstack import Openstack -from webui.PCWConfig import PCWConfig - - -def assert_not_called_with(self, *args, **kwargs): - try: - self.assert_called_with(*args, **kwargs) - except AssertionError: - return - raise AssertionError('Expected %s to not have been called.' % self._format_mock_call_signature(args, kwargs)) - - -MagicMock.assert_not_called_with = assert_not_called_with - - -@fixture -def openstack_instance(): - with patch.object(Openstack, 'read_auth_json', return_value={}): - with patch.object(Openstack, 'get_data', return_value="CustomRegion"): - with patch('openstack.connect') as mock_connect: - mock_client = MagicMock() - mock_connect.return_value = mock_client - yield Openstack('test_namespace') - - -def test_is_outdated(openstack_instance): - now = datetime.now(timezone.utc) - - max_days = 10 - patch.object(PCWConfig, 'get_feature_property', return_value=max_days) - - # Test cases with different timestamps and max_days values - test_cases = [ - # Timestamp is within the valid range - { - "timestamp": (now - timedelta(days=1)).isoformat(), - "expected": False, - }, - # Timestamp is exactly at the max_days limit - { - "timestamp": (now - timedelta(days=max_days)).isoformat(), - "expected": True, - }, - # Timestamp exceeds the max_days limit - { - "timestamp": (now - timedelta(days=max_days+1)).isoformat(), - "expected": True, - }, - # Timestamp is in the future - { - "timestamp": (now + timedelta(days=max_days+1)).isoformat(), - "expected": False, - }, - ] - - for test in test_cases: - assert openstack_instance.is_outdated(test["timestamp"], "openstack-vm-max-age-days") == test["expected"] - - -def test_cleanup_all(openstack_instance): - openstack_instance.cleanup_all() - openstack_instance.client().compute.servers.assert_called_once() - openstack_instance.client().image.images.assert_called_once() - openstack_instance.client().list_keypairs.assert_called_once() - - -def test_cleanup_instances(openstack_instance): - # 
Prepare test data - outdated_server = MagicMock() - outdated_server.name = 'openqa-vm-outdated' - outdated_server.created_at = (datetime.now(timezone.utc) - timedelta(days=8)).isoformat() - - recent_server = MagicMock() - recent_server.name = 'openqa-vm-recent' - recent_server.created_at = (datetime.now(timezone.utc) - timedelta(days=1)).isoformat() - - openstack_instance.client().compute.servers.return_value = [outdated_server, recent_server] - - # Test with dry_run=False - openstack_instance.dry_run = False - openstack_instance._cleanup_instances() - - kwargs = {'wait': False, 'timeout': 180, 'delete_ips': True, 'delete_ip_retry': 1} - openstack_instance.client().delete_server.assert_called_once_with(outdated_server.name, **kwargs) - openstack_instance.client().delete_server.assert_not_called_with(recent_server.name) - - # Reset mocks - openstack_instance.client().delete_server.reset_mock() - - # Test with dry_run=True - openstack_instance.dry_run = True - openstack_instance._cleanup_instances() - - openstack_instance.client().delete_server.assert_not_called() - - -def test_cleanup_images(openstack_instance): - Image = namedtuple('Image', ['name', 'created_at', 'tags']) - - # Prepare test data - max_days = 7 - patch.object(PCWConfig, 'get_feature_property', return_value=max_days) - images = [ - Image( - name='openqa-image-outdated', - created_at=(datetime.now(timezone.utc) - timedelta(days=max_days+1)).isoformat(), - tags=['openqa'], - ), - Image( - name='openqa-image-recent', - created_at=(datetime.now(timezone.utc) - timedelta(days=1)).isoformat(), - tags=['openqa'], - ), - Image( - name='not-openqa-image', - created_at=(datetime.now(timezone.utc) - timedelta(days=max_days+1)).isoformat(), - tags=[], - ), - ] - openstack_instance.client().image.images.return_value = images - - # Test with dry_run=False - openstack_instance.dry_run = False - openstack_instance._cleanup_images() - - kwargs = {'wait': False, 'timeout': 3600} - 
openstack_instance.client().delete_image.assert_called_once_with(images[0].name, **kwargs) - openstack_instance.client().delete_image.assert_not_called_with(images[1].name) - openstack_instance.client().delete_image.assert_not_called_with(images[2].name) - - # Reset mocks - openstack_instance.client().delete_image.reset_mock() - - # Test with dry_run=True - openstack_instance.dry_run = True - openstack_instance._cleanup_images() - - openstack_instance.client().delete_image.assert_not_called() - - -def test_cleanup_keypairs(openstack_instance): - Keypair = namedtuple('Keypair', ['name', 'created_at']) - - # Prepare test data - max_days = 3 - keypairs = [ - Keypair( - name='openqa-keypair-outdated', - created_at=(datetime.now(timezone.utc) - timedelta(days=max_days+1)).isoformat(), - ), - Keypair( - name='openqa-keypair-recent', - created_at=(datetime.now(timezone.utc) - timedelta(days=1)).isoformat(), - ), - Keypair( - name='not-openqa-keypair', - created_at=(datetime.now(timezone.utc) - timedelta(days=max_days+1)).isoformat(), - ), - ] - openstack_instance.client().list_keypairs.return_value = keypairs - - # Test with dry_run=False - openstack_instance.dry_run = False - openstack_instance._cleanup_keypairs() - - openstack_instance.client().delete_keypair.assert_called_once_with(keypairs[0].name) - openstack_instance.client().delete_keypair.assert_not_called_with(keypairs[1].name) - openstack_instance.client().delete_keypair.assert_not_called_with(keypairs[2].name) - - # Reset mocks - openstack_instance.client().delete_keypair.reset_mock() - - # Test with dry_run=True - openstack_instance.dry_run = True - openstack_instance._cleanup_keypairs() - - openstack_instance.client().delete_keypair.assert_not_called() diff --git a/tests/test_pcwconfig.py b/tests/test_pcwconfig.py index b7710a55..75a90bcf 100644 --- a/tests/test_pcwconfig.py +++ b/tests/test_pcwconfig.py @@ -94,7 +94,7 @@ def test_get_namespaces_for_feature_default_feature_exists_namespace_in_feature( def 
test_get_providers_for_not_existed_feature(pcw_file): providers = PCWConfig.get_providers_for('get_providers_for', 'not_existent') assert type(providers) is list - assert not {'EC2', 'AZURE', 'GCE', 'OSTACK'} ^ set(providers) + assert not {'EC2', 'AZURE', 'GCE'} ^ set(providers) def test_get_providers_for_existed_feature(pcw_file): diff --git a/webui/PCWConfig.py b/webui/PCWConfig.py index 90f8489a..6284dc8e 100644 --- a/webui/PCWConfig.py +++ b/webui/PCWConfig.py @@ -61,9 +61,6 @@ def get_feature_property(feature: str, feature_property: str, namespace: str | N 'cleanup/ec2-max-age-days': {'default': -1, 'return_type': int}, 'cleanup/gce-bucket': {'default': None, 'return_type': str}, 'cleanup/max-age-hours': {'default': 24 * 7, 'return_type': int}, - 'cleanup/openstack-image-max-age-days': {'default': 3, 'return_type': int}, - 'cleanup/openstack-vm-max-age-days': {'default': 1, 'return_type': int}, - 'cleanup/openstack-key-max-days': {'default': 1, 'return_type': int}, 'updaterun/default_ttl': {'default': 44400, 'return_type': int}, 'notify/to': {'default': None, 'return_type': str}, 'notify/age-hours': {'default': 12, 'return_type': int}, @@ -94,7 +91,7 @@ def get_namespaces_for(feature: str) -> list: @staticmethod def get_providers_for(feature: str, namespace: str): return ConfigFile().getList(f'{feature}.namespace.{namespace}/providers', - ConfigFile().getList(f'{feature}/providers', ['EC2', 'AZURE', 'GCE', 'OSTACK'])) + ConfigFile().getList(f'{feature}/providers', ['EC2', 'AZURE', 'GCE'])) @staticmethod def get_k8s_clusters_for_provider(namespace: str, provider: str) -> list: