Use timestamps from filenames (#35)
* Rename types module to schemas

* Look for timestamp in filenames first

* Fix other filemanagement tests

* Commands for exploring captures before processing
mihow authored May 4, 2023
1 parent 15ffd14 commit 666fc8a
Showing 18 changed files with 154 additions and 37 deletions.
36 changes: 34 additions & 2 deletions trapdata/cli/show.py
@@ -1,4 +1,5 @@
import datetime
import pathlib
from typing import Optional

import typer
@@ -10,6 +11,7 @@
from trapdata import logger, ml
from trapdata.cli import settings
from trapdata.cli.queue import status as queue_status
from trapdata.common.filemanagement import find_images
from trapdata.db import models
from trapdata.db.base import get_session_class
from trapdata.db.models.deployments import list_deployments
@@ -24,6 +26,7 @@
update_all_aggregates,
)
from trapdata.db.models.occurrences import list_occurrences, list_species
from trapdata.ml.utils import StopWatch

cli = typer.Typer(no_args_is_help=True)

@@ -90,11 +93,40 @@ def deployments():


@cli.command()
def captures(deployment: str):
def captures(path: Optional[pathlib.Path] = None, max_num: Optional[int] = None):
"""
Summarize the raw images captured by a deployment.
"""
raise NotImplementedError
if not path:
path = settings.image_base_path
images = []
i = 0
with StopWatch() as t:
for i, f in enumerate(find_images(path)):
# logger.debug(f'Found {f["path"].name} from {f["timestamp"].strftime("%c")}')
images.append(f)
if max_num and i + 1 >= max_num:
break

print(f"Total images: {i+1}")
print(t)
print(images)


@cli.command()
def capture_counts(path: Optional[pathlib.Path] = None, check_exif: bool = False):
"""
Show number of raw images captured by a deployment as fast as possible.
By default does not read timestamps from image EXIF data.
"""
if not path:
path = settings.image_base_path
with StopWatch() as t:
count = sum(1 for _ in find_images(path, check_exif=check_exif))
print(f"Total images: {count}")
print(t)
print(0)


@cli.command()
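The two new commands above are plain Typer command functions, so they can be exercised directly from Python as well as from the CLI. A minimal sketch, assuming the package and its settings are importable; the capture directory below is hypothetical and not part of the commit:

```python
# Sketch only: call the new inspection commands as ordinary functions.
import pathlib

from trapdata.cli.show import captures, capture_counts

# Hypothetical directory of raw capture images.
image_dir = pathlib.Path("~/trap-captures/2022-06-22").expanduser()

# Fast count: with check_exif=False, timestamps are only parsed from filenames.
capture_counts(path=image_dir, check_exif=False)

# List the first 10 images found, including their parsed timestamps.
captures(path=image_dir, max_num=10)
```

Because both functions take keyword arguments with defaults, Typer should expose them as CLI options as well, so the same exploration can be done from the command line before kicking off processing.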
Empty file added trapdata/common/__init__.py
121 changes: 99 additions & 22 deletions trapdata/common/filemanagement.py
@@ -14,6 +14,9 @@
import imagesize
import PIL.ExifTags
import PIL.Image
from dateutil.parser import ParserError

from trapdata.tests import TEST_IMAGES_BASE_PATH # noqa: F401

from . import constants
from .logs import logger
@@ -111,7 +114,8 @@ def construct_exif(
"""
Construct an EXIF class using human readable keys.
Can be save to a Pillow image using:
>>> image = PIL.Image("test.jpg")
>>> image = PIL.Image.open("./trapdata/tests/images/denmark/20220811005907-00-78.jpg")
>>> existing_exif = image.getexif()
>>> exif_data = construct_exif(description="hi!", existing_exif=existing_exif)
>>> image.save("test_with_exif.jpg", exif=exif_data)
@@ -161,7 +165,47 @@ def get_image_filesize(img_path):
return pathlib.Path(img_path).stat().st_size


def get_image_timestamp(img_path):
def get_image_timestamp_from_filename(img_path) -> datetime.datetime:
"""
Parse the date and time a photo was taken from its filename.
The timestamp must be in the format `YYYYMMDDHHMMSS` but can be
preceded or followed by other characters (e.g. `84-20220916202959-snapshot.jpg`).
>>> out_fmt = "%Y-%m-%d %H:%M:%S"
>>> # Aarhus date format
>>> get_image_timestamp_from_filename("20220810231507-00-07.jpg").strftime(out_fmt)
'2022-08-10 23:15:07'
>>> # Diopsis date format
>>> get_image_timestamp_from_filename("20230124191342.jpg").strftime(out_fmt)
'2023-01-24 19:13:42'
>>> # Snapshot date format in Vermont traps
>>> get_image_timestamp_from_filename("20220622000459-108-snapshot.jpg").strftime(out_fmt)
'2022-06-22 00:04:59'
>>> # Snapshot date format in Cyprus traps
>>> get_image_timestamp_from_filename("84-20220916202959-snapshot.jpg").strftime(out_fmt)
'2022-09-16 20:29:59'
"""
name = pathlib.Path(img_path).stem
date = None

# Extract date from a filename using regex in the format %Y%m%d%H%M%S
matches = re.search(r"(\d{14})", name)
if matches:
date = datetime.datetime.strptime(matches.group(), "%Y%m%d%H%M%S")
else:
date = dateutil.parser.parse(
name, fuzzy=False
) # Fuzzy will interpret "DSC_1974" as 1974-01-01

if date:
return date
else:
raise ValueError(f"Could not parse date from filename '{img_path}'")


def get_image_timestamp_from_exif(img_path):
"""
Parse the date and time a photo was taken from its EXIF data.
@@ -174,6 +218,42 @@ def get_image_timestamp(img_path):
return date


def get_image_timestamp(
img_path, check_exif=True, assert_exists=True
) -> datetime.datetime:
"""
Parse the date and time a photo was taken from its filename or EXIF data.
Reading EXIF data is slow, so it is only used as a fallback when the filename cannot be parsed.
The `check_exif` flag defaults to True for backwards compatibility.
>>> images = pathlib.Path(TEST_IMAGES_BASE_PATH)
>>> # Use filename
>>> get_image_timestamp(images / "cyprus/84-20220916202959-snapshot.jpg").strftime("%Y-%m-%d %H:%M:%S")
'2022-09-16 20:29:59'
>>> # Fallback to EXIF
>>> get_image_timestamp(images / "DSLR/DSC_0390.JPG").strftime("%Y-%m-%d %H:%M:%S")
'2022-07-19 14:28:16'
"""
if assert_exists:
assert pathlib.Path(img_path).exists(), f"Image file does not exist: {img_path}"
try:
date = get_image_timestamp_from_filename(img_path)
except (ValueError, ParserError) as e:
if check_exif:
logger.debug(f"Could not parse date from filename: {e}. Trying EXIF.")
try:
date = get_image_timestamp_from_exif(img_path)
except dateutil.parser.ParserError:
logger.error(
f"Could not parse image timestamp from filename or EXIF tags: {e}."
)
raise
else:
raise
return date


def get_image_timestamp_with_timezone(img_path, default_offset="+0"):
"""
Parse the date and time a photo was taken from its EXIF data.
@@ -195,11 +275,11 @@ def get_image_timestamp_with_timezone(img_path, default_offset="+0"):
def find_images(
base_directory,
absolute_paths=False,
include_timestamps=True,
skip_bad_exif=True,
check_exif=True,
skip_missing_timestamps=True,
):
logger.info(f"Scanning '{base_directory}' for images")
base_directory = pathlib.Path(base_directory)
base_directory = pathlib.Path(base_directory).expanduser().resolve()
if not base_directory.exists():
raise Exception(f"Directory does not exist: {base_directory}")
extensions_list = "|".join(
@@ -215,19 +295,16 @@
shape = get_image_dimensions(path)
filesize = get_image_filesize(path)

if include_timestamps:
try:
date = get_image_timestamp_with_timezone(full_path)
except Exception as e:
logger.error(
f"Could not get EXIF date for image: {full_path}\n {e}"
)
if skip_bad_exif:
continue
else:
date = None
else:
date = None
try:
date = get_image_timestamp(full_path, check_exif=check_exif)
except Exception as e:
logger.error(
f"Skipping image, could not determine timestamp for: {full_path}\n {e}"
)
if skip_missing_timestamps:
continue
else:
date = None

yield {
"path": path,
@@ -247,10 +324,10 @@ def group_images_by_day(images, maximum_gap_minutes=6 * 60):
# @TODO add other group by methods? like image size, camera model, random sample batches, etc. Add to UI settings
@TODO make fake images for this test
>>> images = find_images(TEST_IMAGES_BASE_PATH, skip_bad_exif=True)
>>> sessions = group_images_by_session(images)
>>> images = find_images(TEST_IMAGES_BASE_PATH, skip_missing_timestamps=True)
>>> sessions = group_images_by_day(images)
>>> len(sessions)
11
7
"""
logger.info(
f"Grouping images into date-based groups with a maximum gap of {maximum_gap_minutes} minutes"
Expand All @@ -270,7 +347,7 @@ def group_images_by_day(images, maximum_gap_minutes=6 * 60):
else:
delta = maximum_gap_minutes

# logger.debug(f"{timestamp}, {round(delta, 2)}")
logger.debug(f"{image['timestamp']}, {round(delta, 2)}")

if delta >= maximum_gap_minutes:
current_day = image["timestamp"].date()
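For reference, a standalone sketch of the gap-based grouping used by `group_images_by_day` above: a new date-keyed group starts whenever the time since the previous image exceeds `maximum_gap_minutes`, so captures that run past midnight stay in the session of the night they started. The timestamps and the helper name below are hypothetical, not part of the commit:

```python
# Sketch only: mirrors the delta >= maximum_gap_minutes logic from
# group_images_by_day, keyed by the date on which each new group starts.
import datetime
from collections import OrderedDict


def group_by_gap(timestamps, maximum_gap_minutes=6 * 60):
    groups = OrderedDict()
    previous = None
    current_day = None
    for ts in sorted(timestamps):
        # The first item always starts a group; afterwards compare against the gap.
        delta = (ts - previous).total_seconds() / 60 if previous else maximum_gap_minutes
        if delta >= maximum_gap_minutes:
            current_day = ts.date()
        groups.setdefault(current_day, []).append(ts)
        previous = ts
    return groups


captures = [
    datetime.datetime(2022, 6, 21, 22, 0),
    datetime.datetime(2022, 6, 21, 23, 30),
    datetime.datetime(2022, 6, 22, 1, 0),    # past midnight, same session
    datetime.datetime(2022, 6, 22, 22, 15),  # long daytime gap -> new session
    datetime.datetime(2022, 6, 22, 23, 45),
]
print({day.isoformat(): len(items) for day, items in group_by_gap(captures).items()})
# {'2022-06-21': 3, '2022-06-22': 2}
```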
File renamed without changes: trapdata/common/types.py → trapdata/common/schemas.py
2 changes: 1 addition & 1 deletion trapdata/db/base.py
@@ -11,7 +11,7 @@
from sqlalchemy import orm

from trapdata import logger
from trapdata.common.types import DatabaseURL
from trapdata.common.schemas import DatabaseURL

DIALECT_CONNECTION_ARGS = {
"sqlite": {
2 changes: 1 addition & 1 deletion trapdata/db/models/deployments.py
@@ -11,7 +11,7 @@
from pydantic import BaseModel
from sqlalchemy import orm

from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.db import models


10 changes: 7 additions & 3 deletions trapdata/db/models/detections.py
@@ -10,7 +10,7 @@
from trapdata import constants, db
from trapdata.common.filemanagement import absolute_path, construct_exif, save_image
from trapdata.common.logs import logger
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.common.utils import bbox_area, bbox_center, export_report
from trapdata.db import models
from trapdata.db.models.images import completely_classified
@@ -507,7 +507,9 @@ def get_species_for_image(db_path, image_id):
def num_species_for_event(
db_path, monitoring_session, classification_threshold: float = 0.6
) -> int:
query = sa.select(sa.func.count(DetectedObject.specific_label.distinct()),).where(
query = sa.select(
sa.func.count(DetectedObject.specific_label.distinct()),
).where(
(DetectedObject.specific_label_score >= classification_threshold)
& (DetectedObject.monitoring_session == monitoring_session)
)
@@ -519,7 +521,9 @@ def num_occurrences_for_event(
def num_occurrences_for_event(
db_path, monitoring_session, classification_threshold: float = 0.6
) -> int:
query = sa.select(sa.func.count(DetectedObject.sequence_id.distinct()),).where(
query = sa.select(
sa.func.count(DetectedObject.sequence_id.distinct()),
).where(
(DetectedObject.specific_label_score >= classification_threshold)
& (DetectedObject.monitoring_session == monitoring_session)
)
2 changes: 1 addition & 1 deletion trapdata/db/models/events.py
@@ -9,7 +9,7 @@

from trapdata.common.filemanagement import find_images, group_images_by_day
from trapdata.common.logs import logger
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.common.utils import export_report
from trapdata.db import Base, get_session, models

2 changes: 1 addition & 1 deletion trapdata/db/models/queue.py
@@ -4,7 +4,7 @@
import sqlalchemy as sa

from trapdata import constants, logger
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.db import get_session
from trapdata.db.models.detections import DetectedObject
from trapdata.db.models.events import MonitoringSession
2 changes: 1 addition & 1 deletion trapdata/ml/models/base.py
@@ -8,7 +8,7 @@
from sentry_sdk import start_transaction

from trapdata import logger
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.common.utils import slugify
from trapdata.db.models.queue import QueueManager
from trapdata.ml.utils import StopWatch, get_device, get_or_download_file
2 changes: 1 addition & 1 deletion trapdata/ml/models/tracking.py
@@ -12,7 +12,7 @@
from torchvision import transforms

from trapdata import constants, logger
from trapdata.common.types import BoundingBox, FilePath
from trapdata.common.schemas import BoundingBox, FilePath
from trapdata.db.models.detections import DetectedObject, save_classified_objects
from trapdata.db.models.events import MonitoringSession
from trapdata.db.models.images import TrapImage
3 changes: 2 additions & 1 deletion trapdata/ml/pipeline.py
@@ -1,6 +1,7 @@
from sqlalchemy import orm

from trapdata import logger, ml
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.db.base import get_session_class
from trapdata.settings import Settings

2 changes: 1 addition & 1 deletion trapdata/settings.py
@@ -10,7 +10,7 @@

from trapdata import ml
from trapdata.common.filemanagement import default_database_dsn, get_app_dir
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath


class Settings(BaseSettings):
3 changes: 3 additions & 0 deletions trapdata/tests/__init__.py
@@ -0,0 +1,3 @@
import os

TEST_IMAGES_BASE_PATH = os.path.join(os.path.dirname(__file__), "images")
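The doctests added to `filemanagement.py` resolve their sample files through this constant. A minimal sketch of running them with the standard-library doctest runner, assuming the bundled images under `trapdata/tests/images` are present (the `group_images_by_day` doctest still carries a `@TODO` about fake images, so its count may vary):

```python
# Sketch only: run the filemanagement doctests against the bundled test images.
import doctest

from trapdata.common import filemanagement

results = doctest.testmod(filemanagement, verbose=False)
print(results)  # e.g. TestResults(failed=0, attempted=...)
```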
Binary file added trapdata/tests/images/DSLR/DSC_0390.JPG
Binary file added trapdata/tests/images/DSLR/DSC_0391.JPG
2 changes: 1 addition & 1 deletion trapdata/tests/test_pipeline.py
@@ -11,7 +11,7 @@
from rich import print

from trapdata import logger
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.db import check_db, get_session_class
from trapdata.db.models.events import get_or_create_monitoring_sessions
from trapdata.db.models.queue import (
2 changes: 1 addition & 1 deletion trapdata/ui/pipeline.py
@@ -1,7 +1,7 @@
import pathlib

from trapdata import logger, ml
from trapdata.common.types import FilePath
from trapdata.common.schemas import FilePath
from trapdata.db.base import get_session_class

# @TODO Replace this pipeline file with the version in ml/pipeline.py after fully
