Skip to content

Commit

Permalink
Merge pull request #265 from weingartlorenz/main
Browse files Browse the repository at this point in the history
This is a component designed to download the Xview dataset
  • Loading branch information
romeokienzler authored May 31, 2024
2 parents 2dcb166 + 5759903 commit b6c60c3
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 0 deletions.
57 changes: 57 additions & 0 deletions component-library/input/input-Xview-download.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# CWL wrapper that launches the Xview download component through the
# `claimed` command. Every input with an inputBinding is appended to the
# command line as `<prefix> <value>` in `position` order.
cwlVersion: v1.2
class: CommandLineTool

baseCommand: "claimed"

inputs:
  component:
    type: string
    default: docker.io/mdorzweiler/claimed-input-xview-download:0.1
    inputBinding:
      position: 1
      prefix: --component
  log_level:
    type: string
    default: "INFO"
    inputBinding:
      position: 2
      prefix: --log_level

  # The inputs below have no meaningful default. They were previously declared
  # with `default: None`, which YAML reads as the five-character string "None"
  # (not null), so an omitted input was forwarded as e.g. `--username None`.
  # They are now nullable (`string?`) without a default: when the caller leaves
  # one out, CWL simply omits the corresponding flag.
  username:
    type: string?
    inputBinding:
      position: 3
      prefix: --username
  password:
    type: string?
    inputBinding:
      position: 4
      prefix: --password
  move_to_dir:
    type: string?
    inputBinding:
      position: 5
      prefix: --move_to_dir
  chromedriver_path:
    type: string?
    inputBinding:
      position: 6
      prefix: --chromedriver_path
  max_download_time:
    type: string?
    inputBinding:
      position: 7
      prefix: --max_download_time
  label:
    type: string?
    inputBinding:
      position: 8
      prefix: --label


outputs: []
213 changes: 213 additions & 0 deletions component-library/input/input-Xview-download.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "147f9480",
"metadata": {},
"source": [
"## Xview Dataset Download \n",
"\n",
"This component is designed to download a labeled overhead image dataset, provided a chromedriver, to a specified location. \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c185c1f0",
"metadata": {},
"outputs": [],
"source": [
"!pip install selenium"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc0554b5",
"metadata": {},
"outputs": [],
"source": [
"\n",
"import os\n",
"import shutil\n",
"import time\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from urllib.parse import urlparse\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "866d16c3",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# username for the Xview webpage to authorize login\n",
"username = os.environ.get('username')\n",
"\n",
"# password for the Xview webpage to authorize login\n",
"password = os.environ.get('password')\n",
"\n",
"# move_to_dir the directory where the dataset should be saved\n",
"move_to_dir = os.environ.get('move_to_dir')\n",
"\n",
"# chromedriver_path the directory where the local copy of chromedriver is saved\n",
"chromedriver_path = os.environ.get('chromedriver_path')\n",
"\n",
"# max_download_time (seconds) before timeout, must be adjusted according to the file size and internet speed\n",
"max_download_time = os.environ.get('max_download_time')\n",
"\n",
"# The label of the file desired to download.\n",
"# Choose from \"TI.zip\", \"TL.zip\", \"VI.zip\", \"TI.tgz\", \"TL.tgz\", \"VI.tgz\",\n",
"# standing for TI=Training Images, TL=Training Labels, VI=Validation Images\n",
"label = os.environ.get('label')\n"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "794506c5",
 "metadata": {},
 "outputs": [],
 "source": [
  "\n",
  "def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label):\n",
  "    \"\"\"Log in to the Xview challenge site and download one dataset archive.\n",
  "\n",
  "    Parameters:\n",
  "        username, password: credentials typed into the 'email' and 'password' login fields.\n",
  "        move_to_dir: directory Chrome downloads into; created up front if it is missing.\n",
  "        chromedriver_path: path to the local chromedriver executable.\n",
  "        max_download_time: seconds to wait for the downloaded file to appear. Values read\n",
  "            from os.environ are strings, so it is converted with float() before use.\n",
  "        label: one of TI.zip, TL.zip, VI.zip, TI.tgz, TL.tgz, VI.tgz.\n",
  "\n",
  "    Raises:\n",
  "        ValueError: if label is not a supported option, or max_download_time is not numeric.\n",
  "    \"\"\"\n",
  "    # Local import keeps this cell self-contained (Service is not imported by the import cell).\n",
  "    from selenium.webdriver.chrome.service import Service\n",
  "\n",
  "    # Link text shown on the download page for every supported label.\n",
  "    link_texts = {\n",
  "        \"TI.zip\": \"Download Training Images (zip)\",\n",
  "        \"TL.zip\": \"Download Training Labels (zip)\",\n",
  "        \"VI.zip\": \"Download Validation Images (zip)\",\n",
  "        \"TI.tgz\": \"Download Training Images (tgz)\",\n",
  "        \"TL.tgz\": \"Download Training Labels (tgz)\",\n",
  "        \"VI.tgz\": \"Download Validation Images (tgz)\",\n",
  "    }\n",
  "\n",
  "    # Validate the arguments before a browser is started so bad input fails fast\n",
  "    # and cannot leave a Chrome process behind.\n",
  "    if label not in link_texts:\n",
  "        raise ValueError(\"Error: This is an invalid download option\")\n",
  "    search_text = '//a[contains(text(), \"{}\")]'.format(link_texts[label])\n",
  "\n",
  "    # The timeout is compared against a float number of elapsed seconds below;\n",
  "    # comparing it as a str would raise TypeError.\n",
  "    max_download_time = float(max_download_time)\n",
  "\n",
  "    # Create the target directory before Chrome starts writing into it.\n",
  "    os.makedirs(move_to_dir, exist_ok=True)\n",
  "\n",
  "    # Set Chrome options to automatically download files to the specified directory\n",
  "    options = webdriver.ChromeOptions()\n",
  "    prefs = {\n",
  "        \"download.default_directory\": move_to_dir,\n",
  "        \"download.prompt_for_download\": False,\n",
  "        \"download.directory_upgrade\": True,\n",
  "        \"safebrowsing.enabled\": True\n",
  "    }\n",
  "    options.add_experimental_option(\"prefs\", prefs)\n",
  "\n",
  "    # Selenium 4 takes the driver path through a Service object; the former\n",
  "    # executable_path keyword of webdriver.Chrome is no longer accepted.\n",
  "    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path), options=options)\n",
  "    try:\n",
  "        # Open the login page\n",
  "        url_login = r'https://challenge.xviewdataset.org/login'\n",
  "        driver.get(url_login)\n",
  "\n",
  "        # Find the username and password fields and enter credentials\n",
  "        username_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'email')))\n",
  "        password_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))\n",
  "        username_field.send_keys(username)\n",
  "        password_field.send_keys(password)\n",
  "\n",
  "        # Find and click the login button\n",
  "        login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))\n",
  "        login_button.click()\n",
  "\n",
  "        # Wait for the page to load after login\n",
  "        time.sleep(1)\n",
  "\n",
  "        # Open the Download page\n",
  "        url_download = r'https://challenge.xviewdataset.org/download-links'\n",
  "        driver.get(url_download)\n",
  "\n",
  "        # Wait for the overlay element to be present\n",
  "        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))\n",
  "\n",
  "        # Remove the automatic pop-up overlay by clicking on the page body.\n",
  "        # find_element(By.TAG_NAME, ...) replaces the removed find_element_by_tag_name helper.\n",
  "        body_element = driver.find_element(By.TAG_NAME, 'body')\n",
  "        body_element.click()\n",
  "        time.sleep(1)\n",
  "\n",
  "        # Wait for the download link to be present\n",
  "        download_link_element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, search_text)))\n",
  "\n",
  "        # Get the dynamic download link from the href attribute\n",
  "        download_link = download_link_element.get_attribute('href')\n",
  "\n",
  "        # Download the dataset using the obtained link\n",
  "        if download_link:\n",
  "            driver.get(download_link)\n",
  "            print(\"Dataset download started successfully.\")\n",
  "\n",
  "            # Extract the filename from the download link URL\n",
  "            parsed_url = urlparse(download_link)\n",
  "            filename = parsed_url.path.split('/')[-1]\n",
  "            downloaded_file = os.path.join(move_to_dir, filename)\n",
  "            print(downloaded_file)\n",
  "\n",
  "            # Poll until the file shows up non-empty or the time budget is used up\n",
  "            start_time = time.time()\n",
  "            while True:\n",
  "                if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:\n",
  "                    print(\"File downloaded successfully.\")\n",
  "                    break\n",
  "                elif time.time() - start_time > max_download_time:\n",
  "                    print(\"Error: Maximum wait time exceeded.\")\n",
  "                    break\n",
  "                else:\n",
  "                    time.sleep(5)\n",
  "        else:\n",
  "            print(\"Failed to get the download link.\")\n",
  "    finally:\n",
  "        # Always close the browser, even when a wait times out or another step raises.\n",
  "        driver.quit()\n"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7b2f96d",
"metadata": {},
"outputs": [],
"source": [
"login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
30 changes: 30 additions & 0 deletions component-library/input/input-Xview-download.job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Kubernetes Job that runs the Xview download notebook inside the component
# image. The `value_of_*` strings and `image_pull_secret` look like
# placeholders to be substituted before the manifest is applied - TODO confirm.
apiVersion: batch/v1
kind: Job
metadata:
  name: input-xview-download
spec:
  template:
    spec:
      containers:
        - name: input-xview-download
          image: docker.io/mdorzweiler/claimed-input-xview-download:0.1
          workingDir: /opt/app-root/src/
          # The notebook is executed with ipython from the working directory.
          command:
            - /opt/app-root/bin/ipython
            - claimed_input-Xview-download.ipynb
          # The notebook reads each of these through os.environ.get(<name>).
          env:
            - name: log_level
              value: value_of_log_level
            - name: username
              value: value_of_username
            - name: password
              value: value_of_password
            - name: move_to_dir
              value: value_of_move_to_dir
            - name: chromedriver_path
              value: value_of_chromedriver_path
            - name: max_download_time
              value: value_of_max_download_time
            - name: label
              value: value_of_label
      restartPolicy: OnFailure
      imagePullSecrets:
        - name: image_pull_secret
31 changes: 31 additions & 0 deletions component-library/input/input-Xview-download.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Component definition for the Xview dataset download. The layout
# (inputs / outputs / implementation.container with inputValue placeholders)
# appears to follow the Kubeflow Pipelines component spec - TODO confirm.
name: input-xview-download
description: "## Xview Dataset Download – CLAIMED V0.1"

inputs:
  - name: log_level
    type: String
    description: "update log level"
    default: "INFO"
  - name: username
    type: String
    description: "username for the Xview webpage to authorize login"
  - name: password
    type: String
    description: "password for the Xview webpage to authorize login"
  - name: move_to_dir
    type: String
    description: "move_to_dir the directory where the dataset should be saved"
  - name: chromedriver_path
    type: String
    description: "chromedriver_path the directory where the local copy of chromedriver is saved"
  - name: max_download_time
    type: String
    description: "max_download_time (seconds) before timeout, must be adjusted according to the file size and internet speed"
  - name: label
    type: String
    description: "The label of the file to download. Choose from TI.zip, TL.zip, VI.zip, TI.tgz, TL.tgz, VI.tgz (TI=Training Images, TL=Training Labels, VI=Validation Images)"

# Explicit empty list: a bare `outputs:` parses as null, not as "no outputs".
outputs: []

implementation:
  container:
    image: docker.io/mdorzweiler/claimed-input-xview-download:0.1
    command:
      - sh
      - -ec
      # `sh -c <script> a b c ...` binds the trailing arguments to $0, $1, ...
      # so the seven inputValue entries below fill ${0}..${6} in order.
      - |
        ipython ./claimed_input-Xview-download.ipynb log_level="${0}" username="${1}" password="${2}" move_to_dir="${3}" chromedriver_path="${4}" max_download_time="${5}" label="${6}"
      - {inputValue: log_level}
      - {inputValue: username}
      - {inputValue: password}
      - {inputValue: move_to_dir}
      - {inputValue: chromedriver_path}
      - {inputValue: max_download_time}
      - {inputValue: label}

0 comments on commit b6c60c3

Please sign in to comment.