Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update/hackathon feedback #35

Merged
merged 7 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions datalab/datalab_session/analysis/get_tif.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datalab.datalab_session.file_utils import create_tif, get_fits
from datalab.datalab_session.s3_utils import key_exists, add_file_to_bucket, get_s3_url
from datalab.datalab_session.file_utils import create_tif
from datalab.datalab_session.s3_utils import key_exists, add_file_to_bucket, get_s3_url, get_fits

def get_tif(input: dict):
"""
Expand Down
5 changes: 4 additions & 1 deletion datalab/datalab_session/analysis/line_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from astropy import coordinates

from datalab.datalab_session.file_utils import scale_points, get_hdu
from datalab.datalab_session.s3_utils import get_fits

# For creating an array of brightness along a user drawn line
def line_profile(input: dict):
Expand All @@ -19,7 +20,9 @@ def line_profile(input: dict):
y2 (int): The y coordinate of the ending point
}
"""
sci_hdu = get_hdu(input['basename'], 'SCI')
fits_path = get_fits(input['basename'])

sci_hdu = get_hdu(fits_path, 'SCI')

x_points, y_points = scale_points(input["height"], input["width"], sci_hdu.data.shape[0], sci_hdu.data.shape[1], x_points=[input["x1"], input["x2"]], y_points=[input["y1"], input["y2"]])

Expand Down
7 changes: 5 additions & 2 deletions datalab/datalab_session/analysis/source_catalog.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import numpy as np

from datalab.datalab_session.file_utils import get_hdu, scale_points
from datalab.datalab_session.s3_utils import get_fits

def source_catalog(input: dict):
"""
Returns a dict representing the source catalog data with x,y coordinates and flux values
"""
cat_hdu = get_hdu(input['basename'], 'CAT')
sci_hdu = get_hdu(input['basename'], 'SCI')
fits_path = get_fits(input['basename'])

cat_hdu = get_hdu(fits_path, 'CAT')
sci_hdu = get_hdu(fits_path, 'SCI')

# The number of sources to send back to the frontend, default 50
SOURCE_CATALOG_COUNT = min(50, len(cat_hdu.data["x"]))
Expand Down
25 changes: 12 additions & 13 deletions datalab/datalab_session/data_operations/data_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.core.cache import cache
import numpy as np

from datalab.datalab_session.s3_utils import get_fits
from datalab.datalab_session.tasks import execute_data_operation
from datalab.datalab_session.file_utils import get_hdu

Expand Down Expand Up @@ -57,7 +58,7 @@ def perform_operation(self):
status = self.get_status()
if status == 'PENDING' or status == 'FAILED':
self.set_status('IN_PROGRESS')
self.set_percent_completion(0.0)
self.set_operation_progress(0.0)
# This asynchronous task will call the operate() method on the proper operation
execute_data_operation.send(self.name(), self.input_data)

Expand All @@ -78,15 +79,16 @@ def set_message(self, message: str):
def get_message(self) -> str:
return cache.get(f'operation_{self.cache_key}_message', '')

def set_percent_completion(self, percent_completed: float):
cache.set(f'operation_{self.cache_key}_percent_completion', percent_completed, CACHE_DURATION)
def set_operation_progress(self, percent_completed: float):
cache.set(f'operation_{self.cache_key}_progress', percent_completed, CACHE_DURATION)

def get_percent_completion(self) -> float:
return cache.get(f'operation_{self.cache_key}_percent_completion', 0.0)
def get_operation_progress(self) -> float:
return cache.get(f'operation_{self.cache_key}_progress', 0.0)

def set_output(self, output_data: dict):
def set_output(self, output):
output_data = {'output_files': output if isinstance(output, list) else [output]}
self.set_status('COMPLETED')
self.set_percent_completion(1.0)
self.set_operation_progress(1.0)
cache.set(f'operation_{self.cache_key}_output', output_data, CACHE_DURATION)

def get_output(self) -> dict:
Expand All @@ -96,19 +98,16 @@ def set_failed(self, message: str):
self.set_status('FAILED')
self.set_message(message)

def get_fits_npdata(self, input_files: list[dict], percent=None, cur_percent=None) -> list[np.memmap]:
total_files = len(input_files)
def get_fits_npdata(self, input_files: list[dict]) -> list[np.memmap]:
image_data_list = []

# get the fits urls and extract the image data
for index, file_info in enumerate(input_files, start=1):
basename = file_info.get('basename', 'No basename found')
source = file_info.get('source', 'No source found')

sci_hdu = get_hdu(basename, 'SCI', source)
fits_path = get_fits(file_info['basename'], file_info['source'])
sci_hdu = get_hdu(fits_path, 'SCI')
image_data_list.append(sci_hdu.data)

if percent is not None and cur_percent is not None:
self.set_percent_completion(cur_percent + index/total_files * percent)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of the operation progress is used when downloading input files to work on. It seems bad to remove any reporting of that progress. I.e., if you do an operation with 100 images, you will sit there with 0% progress for a long while before it finishes loading all 100 images. Can we retain the reporting of progress up to some maximum based on the number of images it's downloading?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah... it's a lot harder to track the progress. Like you said, downloading is most of the work for many operations. I was also dubious about removing the download percent. Next time I should call a meeting to go over the feedback on whether we should act on it or not. The customer isn't always right, haha


return image_data_list
7 changes: 2 additions & 5 deletions datalab/datalab_session/data_operations/long.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ def operate(self):
for i, file in enumerate(self.input_data.get('input_files', [])):
print(f"Processing long operation on file {file.get('basename', 'No basename found')}")
sleep(per_image_timeout)
self.set_percent_completion((i+1) / num_files)
self.set_operation_progress((i+1) / num_files)
# Done "processing" the files so set the output which sets the final status
output = {
'output_files': self.input_data.get('input_files', [])
}
self.set_output(output)
self.set_output(self.input_data.get('input_files', []))
14 changes: 5 additions & 9 deletions datalab/datalab_session/data_operations/median.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.exceptions import ClientAlertException
from datalab.datalab_session.file_utils import create_fits, crop_arrays, create_jpgs
from datalab.datalab_session.s3_utils import save_fits_and_thumbnails
from datalab.datalab_session.file_utils import crop_arrays, create_output

log = logging.getLogger()
log.setLevel(logging.INFO)
Expand Down Expand Up @@ -49,21 +48,18 @@ def operate(self):

log.info(f'Executing median operation on {len(input)} files')

image_data_list = self.get_fits_npdata(input, percent=0.4, cur_percent=0.0)
image_data_list = self.get_fits_npdata(input)
self.set_operation_progress(0.40)

cropped_data_list = crop_arrays(image_data_list)
stacked_data = np.stack(cropped_data_list, axis=2)

# using the numpy library's median method
median = np.median(stacked_data, axis=2)

fits_file = create_fits(self.cache_key, median)
self.set_operation_progress(0.80)

large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_file)

output_file = save_fits_and_thumbnails(self.cache_key, fits_file, large_jpg_path, small_jpg_path)

output = {'output_files': [output_file]}
output = create_output(self.cache_key, median, comment=f'Product of Datalab Median on files {", ".join([image["basename"] for image in input])}')

self.set_output(output)
log.info(f'Median output: {self.get_output()}')
5 changes: 1 addition & 4 deletions datalab/datalab_session/data_operations/noop.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,4 @@ def wizard_description():

def operate(self):
print("No-op triggered!")
output = {
'output_files': self.input_data.get('input_files', [])
}
self.set_output(output)
self.set_output(self.input_data.get('input_files', []))
17 changes: 6 additions & 11 deletions datalab/datalab_session/data_operations/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import numpy as np

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.file_utils import create_fits, create_jpgs
from datalab.datalab_session.s3_utils import save_fits_and_thumbnails
from datalab.datalab_session.file_utils import create_output

log = logging.getLogger()
log.setLevel(logging.INFO)
Expand Down Expand Up @@ -46,21 +45,17 @@ def operate(self):
log.info(f'Executing normalization operation on {len(input)} file(s)')

image_data_list = self.get_fits_npdata(input)
self.set_percent_completion(0.40)
self.set_operation_progress(0.40)

output_files = []
for index, image in enumerate(image_data_list):
median = np.median(image)
normalized_image = image / median

fits_file = create_fits(self.cache_key, normalized_image)
large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_file)
output_file = save_fits_and_thumbnails(self.cache_key, fits_file, large_jpg_path, small_jpg_path, index=index)
output_files.append(output_file)
output = create_output(self.cache_key, normalized_image, index=index, comment=f'Product of Datalab Normalization on file {input[index]["basename"]}')
output_files.append(output)

self.set_percent_completion(self.get_percent_completion() + .40 * (index + 1) / len(input))

output = {'output_files': output_files}
self.set_operation_progress(self.get_operation_progress() + .40 * (index + 1) / len(input))

self.set_output(output)
self.set_output(output_files)
log.info(f'Normalization output: {self.get_output()}')
44 changes: 21 additions & 23 deletions datalab/datalab_session/data_operations/rgb_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.exceptions import ClientAlertException
from datalab.datalab_session.file_utils import get_fits, crop_arrays, create_fits, create_jpgs
from datalab.datalab_session.s3_utils import save_fits_and_thumbnails
from datalab.datalab_session.file_utils import create_output, crop_arrays, create_jpgs
from datalab.datalab_session.s3_utils import get_fits

log = logging.getLogger()
log.setLevel(logging.INFO)
Expand Down Expand Up @@ -59,30 +59,28 @@ def wizard_description():
def operate(self):
rgb_input_list = self.input_data['red_input'] + self.input_data['green_input'] + self.input_data['blue_input']

if len(rgb_input_list) == 3:
log.info(f'Executing RGB Stack operation on files: {rgb_input_list}')
if len(rgb_input_list) != 3:
raise ClientAlertException('RGB stack requires exactly 3 files')

log.info(f'Executing RGB Stack operation on files: {rgb_input_list}')

fits_paths = []
for file in rgb_input_list:
fits_paths.append(get_fits(file.get('basename')))
self.set_percent_completion(self.get_percent_completion() + 0.2)
large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_paths, color=True)
fits_paths = []
for file in rgb_input_list:
fits_paths.append(get_fits(file.get('basename')))
self.set_operation_progress(self.get_operation_progress() + 0.2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a generic comment and I know we had already been doing this before, but calling the .get_operation_progress() every time we want to set the value causes an extra lookup into the remote cache, which is wasteful. We should probably just maintain a local variable with the operation progress in the class and use this during the operate method so we don't waste time retrieving it from the cache each time we set it.


large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_paths, color=True)

# color photos take three files, so we store it as one fits file with a 3d SCI ndarray
arrays = [fits.open(file)['SCI'].data for file in fits_paths]
cropped_data_list = crop_arrays(arrays)
stacked_data = np.stack(cropped_data_list, axis=2)

fits_file = create_fits(self.cache_key, stacked_data)
# color photos take three files, so we store it as one fits file with a 3d SCI ndarray
arrays = [fits.open(file)['SCI'].data for file in fits_paths]
cropped_data_list = crop_arrays(arrays)
stacked_data = np.stack(cropped_data_list, axis=2)

output_file = save_fits_and_thumbnails(self.cache_key, fits_file, large_jpg_path, small_jpg_path)
self.set_operation_progress(0.8)

rgb_comment = f'Product of Datalab RGB Stack on files {", ".join([image["basename"] for image in rgb_input_list])}'
output = create_output(self.cache_key, stacked_data, large_jpg=large_jpg_path, small_jpg=small_jpg_path, comment=rgb_comment)

output = {'output_files': [output_file]}
else:
output = {'output_files': []}
raise ClientAlertException('RGB Stack operation requires exactly 3 input files')

self.set_percent_completion(1.0)
self.set_operation_progress(1.0)
self.set_output(output)
log.info(f'RGB Stack output: {self.get_output()}')
18 changes: 6 additions & 12 deletions datalab/datalab_session/data_operations/stacking.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.exceptions import ClientAlertException
from datalab.datalab_session.file_utils import create_fits, crop_arrays, create_jpgs
from datalab.datalab_session.s3_utils import save_fits_and_thumbnails
from datalab.datalab_session.file_utils import create_output, crop_arrays

log = logging.getLogger()
log.setLevel(logging.INFO)
Expand Down Expand Up @@ -52,25 +51,20 @@ def operate(self):

image_data_list = self.get_fits_npdata(input_files)

self.set_percent_completion(0.4)
self.set_operation_progress(0.4)

cropped_data = crop_arrays(image_data_list)
stacked_data = np.stack(cropped_data, axis=2)

self.set_percent_completion(0.6)
self.set_operation_progress(0.6)

# using the numpy library's sum method
stacked_sum = np.sum(stacked_data, axis=2)

self.set_percent_completion(0.8)
self.set_operation_progress(0.8)

fits_file = create_fits(self.cache_key, stacked_sum)

large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_file)

output_file = save_fits_and_thumbnails(self.cache_key, fits_file, large_jpg_path, small_jpg_path)

output = {'output_files': [output_file]}
stacking_comment = f'Product of Datalab Stacking. Stack of {", ".join([image["basename"] for image in input_files])}'
output = create_output(self.cache_key, stacked_sum, comment=stacking_comment)

self.set_output(output)
log.info(f'Stacked output: {self.get_output()}')
20 changes: 7 additions & 13 deletions datalab/datalab_session/data_operations/subtraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.exceptions import ClientAlertException
from datalab.datalab_session.file_utils import create_fits, create_jpgs, crop_arrays
from datalab.datalab_session.s3_utils import save_fits_and_thumbnails
from datalab.datalab_session.file_utils import crop_arrays, create_output

log = logging.getLogger()
log.setLevel(logging.INFO)
Expand Down Expand Up @@ -64,10 +63,10 @@ def operate(self):
log.info(f'Executing subtraction operation on {len(input_files)} files')

input_image_data_list = self.get_fits_npdata(input_files)
self.set_percent_completion(.30)
self.set_operation_progress(.30)

subtraction_image = self.get_fits_npdata(subtraction_file_input)[0]
self.set_percent_completion(.40)
self.set_operation_progress(.40)

outputs = []
for index, input_image in enumerate(input_image_data_list):
Expand All @@ -76,15 +75,10 @@ def operate(self):

difference_array = np.subtract(input_image, subtraction_image)

fits_file = create_fits(self.cache_key, difference_array)
large_jpg_path, small_jpg_path = create_jpgs(self.cache_key, fits_file)
subtraction_comment = f'Product of Datalab Subtraction of {subtraction_file_input[0]["basename"]} subtracted from {input_files[index]["basename"]}'
outputs.append(create_output(self.cache_key, difference_array, index=index, comment=subtraction_comment))

output_file = save_fits_and_thumbnails(self.cache_key, fits_file, large_jpg_path, small_jpg_path, index)
outputs.append(output_file)
self.set_operation_progress(self.get_operation_progress() + .50 * (index + 1) / len(input_files))

self.set_percent_completion(self.get_percent_completion() + .50 * (index + 1) / len(input_files))

output = {'output_files': outputs}

self.set_output(output)
self.set_output(outputs)
log.info(f'Subtraction output: {self.get_output()}')
Loading
Loading