From 5759903b7de246519c143cc9e6458d50ec5a51a2 Mon Sep 17 00:00:00 2001 From: weingartlorenz Date: Tue, 7 May 2024 20:03:56 +0200 Subject: [PATCH] This is a component designed to download the Xview dataset Signed-off-by: weingartlorenz --- .../input/input-Xview-download.cwl | 57 +++++ .../input/input-Xview-download.ipynb | 213 ++++++++++++++++++ .../input/input-Xview-download.job.yaml | 30 +++ .../input/input-Xview-download.yaml | 31 +++ 4 files changed, 331 insertions(+) create mode 100644 component-library/input/input-Xview-download.cwl create mode 100644 component-library/input/input-Xview-download.ipynb create mode 100644 component-library/input/input-Xview-download.job.yaml create mode 100644 component-library/input/input-Xview-download.yaml diff --git a/component-library/input/input-Xview-download.cwl b/component-library/input/input-Xview-download.cwl new file mode 100644 index 00000000..bdca414f --- /dev/null +++ b/component-library/input/input-Xview-download.cwl @@ -0,0 +1,57 @@ +cwlVersion: v1.2 +class: CommandLineTool + +baseCommand: "claimed" + +inputs: + component: + type: string + default: docker.io/mdorzweiler/claimed-input-xview-download:0.1 + inputBinding: + position: 1 + prefix: --component + log_level: + type: string + default: "INFO" + inputBinding: + position: 2 + prefix: --log_level + username: + type: string + default: None + inputBinding: + position: 3 + prefix: --username + password: + type: string + default: None + inputBinding: + position: 4 + prefix: --password + move_to_dir: + type: string + default: None + inputBinding: + position: 5 + prefix: --move_to_dir + chromedriver_path: + type: string + default: None + inputBinding: + position: 6 + prefix: --chromedriver_path + max_download_time: + type: string + default: None + inputBinding: + position: 7 + prefix: --max_download_time + label: + type: string + default: None + inputBinding: + position: 8 + prefix: --label + + +outputs: [] diff --git 
a/component-library/input/input-Xview-download.ipynb b/component-library/input/input-Xview-download.ipynb new file mode 100644 index 00000000..7428ee04 --- /dev/null +++ b/component-library/input/input-Xview-download.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "147f9480", + "metadata": {}, + "source": [ + "## Xview Dataset Download \n", + "\n", + "This component is designed to download a labeled overhead image dataset, provided a chromedriver, to a specified location. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c185c1f0", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install selenium" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc0554b5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import os\n", + "import shutil\n", + "import time\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from urllib.parse import urlparse\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "866d16c3", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# username for the Xview webpage to authorize login\n", + "username = os.environ.get('username')\n", + "\n", + "# password for the Xview webpage to authorize login\n", + "password = os.environ.get('password')\n", + "\n", + "# move_to_dir the directory where the dataset should be saved\n", + "move_to_dir = os.environ.get('move_to_dir')\n", + "\n", + "# chromedriver_path the directory where the local copy of chromedriver is saved\n", + "chromedriver_path = os.environ.get('chromedriver_path')\n", + "\n", + "# max_download_time before timeout, must be adjusted according to the file size and internet speed\n", + "max_download_time = os.environ.get('max_download_time')\n", + "\n", + "# The label of the file desired to 
def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label):
    """Log in to the xView challenge site and download one dataset archive with Chrome.

    Args:
        username: Value typed into the login form's ``email`` field.
        password: Value typed into the login form's ``password`` field.
        move_to_dir: Directory Chrome is told to download into; it is created
            if it does not exist yet.
        chromedriver_path: Path to the local chromedriver executable.
        max_download_time: Maximum number of seconds to wait for the file to
            appear in ``move_to_dir``. A numeric string is accepted because the
            notebook reads this value from an environment variable.
        label: Which archive to fetch. One of "TI.zip", "TL.zip", "VI.zip",
            "TI.tgz", "TL.tgz", "VI.tgz" (TI = Training Images,
            TL = Training Labels, VI = Validation Images).

    Raises:
        ValueError: If ``label`` is not one of the options above, or if
            ``max_download_time`` cannot be interpreted as a number.
    """
    # Validate the inputs before a browser is started, so a bad argument
    # cannot leave a Chrome process behind.
    link_texts = {
        "TI.zip": "Download Training Images (zip)",
        "TL.zip": "Download Training Labels (zip)",
        "VI.zip": "Download Validation Images (zip)",
        "TI.tgz": "Download Training Images (tgz)",
        "TL.tgz": "Download Training Labels (tgz)",
        "VI.tgz": "Download Validation Images (tgz)",
    }
    if label not in link_texts:
        raise ValueError("Error: This is an invalid download option")
    search_text = f'//a[contains(text(), "{link_texts[label]}")]'

    # Environment variables are strings; comparing one against a float
    # elapsed time would raise TypeError in the wait loop below.
    try:
        timeout_seconds = float(max_download_time)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"max_download_time must be a number of seconds, got {max_download_time!r}"
        ) from err

    # The target directory has to exist before Chrome starts writing into it.
    # NOTE(review): an absolute path is used because Chrome's download
    # preference is not expected to honour relative paths - confirm.
    download_dir = os.path.abspath(move_to_dir)
    os.makedirs(download_dir, exist_ok=True)

    # Imported here so the block stays self-contained: Selenium 4 takes the
    # driver path through a Service object instead of `executable_path`.
    from selenium.webdriver.chrome.service import Service

    # Set Chrome options to automatically download files to the specified directory
    options = webdriver.ChromeOptions()
    prefs = {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }
    options.add_experimental_option("prefs", prefs)

    # Start a new instance of Chrome web browser
    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path), options=options)

    try:
        # Open the login page
        url_login = r'https://challenge.xviewdataset.org/login'
        driver.get(url_login)

        # Find the username and password fields and enter credentials
        wait = WebDriverWait(driver, 10)
        username_field = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
        password_field = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
        username_field.send_keys(username)
        password_field.send_keys(password)

        # Find and click the login button
        login_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))
        login_button.click()

        # Give the site a moment to complete the login before navigating on
        time.sleep(1)

        # Open the Download page
        url_download = r'https://challenge.xviewdataset.org/download-links'
        driver.get(url_download)

        # Wait for the automatic pop-up overlay to be present, then dismiss it
        # by clicking on the page body
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))
        driver.find_element(By.TAG_NAME, 'body').click()
        time.sleep(1)

        # Wait for the download link to be present
        download_link_element = WebDriverWait(driver, 100).until(
            EC.presence_of_element_located((By.XPATH, search_text))
        )

        # Get the dynamic download link from the href attribute
        download_link = download_link_element.get_attribute('href')
        if not download_link:
            print("Failed to get the download link.")
            return

        # Download the dataset using the obtained link
        driver.get(download_link)
        print("Dataset download started successfully.")

        # Extract the filename from the download link URL
        filename = urlparse(download_link).path.split('/')[-1]
        downloaded_file = os.path.join(download_dir, filename)
        print(downloaded_file)

        # Poll until the file shows up non-empty or the time budget is spent
        deadline = time.monotonic() + timeout_seconds
        while True:
            if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:
                print("File downloaded successfully.")
                break
            if time.monotonic() > deadline:
                print("Error: Maximum wait time exceeded.")
                break
            time.sleep(5)
    finally:
        # Close the browser even when a wait times out or another step fails
        driver.quit()
value_of_username + - name: password + value: value_of_password + - name: move_to_dir + value: value_of_move_to_dir + - name: chromedriver_path + value: value_of_chromedriver_path + - name: max_download_time + value: value_of_max_download_time + - name: label + value: value_of_label + restartPolicy: OnFailure + imagePullSecrets: + - name: image_pull_secret \ No newline at end of file diff --git a/component-library/input/input-Xview-download.yaml b/component-library/input/input-Xview-download.yaml new file mode 100644 index 00000000..67c6651d --- /dev/null +++ b/component-library/input/input-Xview-download.yaml @@ -0,0 +1,31 @@ +name: input-xview-download +description: "## Xview Dataset Download – CLAIMED V0.1" + +inputs: +- {name: log_level, type: String, description: "update log level", default: "INFO"} +- {name: username, type: String, description: "username for the Xview webpage to authorize login"} +- {name: password, type: String, description: "password for the Xview webpage to authorize login"} +- {name: move_to_dir, type: String, description: "move_to_dir the directory where the dataset should be saved"} +- {name: chromedriver_path, type: String, description: "chromedriver_path the directory where the local copy of chromedriver is saved"} +- {name: max_download_time, type: String, description: "max_download_time before timeout, must be adjusted according to the file size and internet speed"} +- {name: label, type: String, description: "standing for TI=Training Images, TL=Training Labels, VI=Validation Images"} + + +outputs: + + +implementation: + container: + image: docker.io/mdorzweiler/claimed-input-xview-download:0.1 + command: + - sh + - -ec + - | + ipython ./claimed_input-Xview-download.ipynb log_level="${0}" username="${1}" password="${2}" move_to_dir="${3}" chromedriver_path="${4}" max_download_time="${5}" label="${6}" + - {inputValue: log_level} + - {inputValue: username} + - {inputValue: password} + - {inputValue: move_to_dir} + - {inputValue: 
chromedriver_path} + - {inputValue: max_download_time} + - {inputValue: label}