Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
f638723
Fix error ( plot = True on retry combination of anomalies)
clarasaja Nov 18, 2025
80a501a
Create base structure for a plot method on dataset generator (remove …
clarasaja Nov 18, 2025
0409a18
Create a list of anomalies applied for plotting purpose
clarasaja Nov 18, 2025
3ae9e14
Implement plot_dataset func and its test
clarasaja Nov 18, 2025
cd5b91a
Fix plotting function
clarasaja Nov 18, 2025
95ca242
Add a function to manage time"str" division
clarasaja Nov 21, 2025
53d08bb
Refactor of generate function and test fixing
clarasaja Nov 24, 2025
6df2c5f
Add limitation on step anomalies time_span
clarasaja Nov 24, 2025
1ba5713
Fix a typo
clarasaja Nov 24, 2025
78f3868
Fix test on step_uv /mv
clarasaja Nov 24, 2025
b23153b
Put max_anomalies_per_series = min(max_anomalies_per_seri…
clarasaja Nov 24, 2025
49a10a3
Put example of usage on the notebook
clarasaja Nov 24, 2025
e102a57
Implement anomalies_ratio
clarasaja Nov 25, 2025
61c3961
Modify a logger info
clarasaja Nov 27, 2025
cf9376f
Align with privatization of _plot_func methods in timeseriesgenerator
clarasaja Nov 28, 2025
e277b00
Move an import to the top of the page
clarasaja Nov 28, 2025
8c086af
Fix logger info
clarasaja Nov 28, 2025
e018338
Fix _plot_func in order to handle multiple anomalies of the same type
clarasaja Nov 28, 2025
006fdab
Add auto_search_anomalies_label=True to plot_datas
clarasaja Nov 28, 2025
2efb320
Fix multiple labelling in plot_func
clarasaja Nov 28, 2025
308a632
Fix random anomalies generation
clarasaja Nov 28, 2025
a252803
Fix a bug in generating series with 0 anomalies
clarasaja Nov 28, 2025
aa84da4
Fix _plot_func autoserach when labelling multiple anomalies of the sa…
clarasaja Nov 28, 2025
3988f3e
Fix _plot_func legend upper_left
clarasaja Nov 28, 2025
8ee3ae5
Update Examples - generators notebook
clarasaja Nov 28, 2025
4a829b0
Change plot_dataset in a static method of the class
clarasaja Nov 28, 2025
c42c75f
Fix a minor bug in plot_dataset
clarasaja Nov 28, 2025
76f5c37
Add auto_repeat_anomalies interface for future implementation
clarasaja Nov 28, 2025
8beec2f
Add implementation for auto_repeate_anomalies
clarasaja Nov 28, 2025
1dc8f32
Correct 'repeate' in 'repeat'
clarasaja Nov 28, 2025
7a26ea9
Change auto_repeat_anomalies default to True
clarasaja Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
322 changes: 26 additions & 296 deletions Examples - generators.ipynb

Large diffs are not rendered by default.

180 changes: 115 additions & 65 deletions ats/dataset_generators.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from .timeseries_generators import HumiTempTimeseriesGenerator
from .timeseries_generators import HumiTempTimeseriesGenerator, _plot_func
import random as rnd
import pandas as pd
import itertools

# Setup logging
import logging
Expand Down Expand Up @@ -30,9 +29,63 @@ def __check_list(self, value, name):
raise TypeError(f"`{name}` must be a list, got {type(value).__name__}.")
return value

def generate(self, n_series=9, time_span='30D', plot=False,
def _divide_time_interval(self, interval_str, max_anomalies_per_series, anomalies=None):
    """Split a time span into equal segments, one per anomaly slot.

    Args:
        interval_str: Total time span, in any form accepted by
            ``pd.Timedelta`` (e.g. ``'60D'``).
        max_anomalies_per_series: Number of equal segments the span is
            divided into. Must be positive.
        anomalies: Anomaly names that may be applied. Only inspected to
            check whether a step anomaly (``"step_uv"`` / ``"step_mv"``)
            is requested. ``None`` means no anomalies.

    Returns:
        str: Length of one segment, formatted as ``"<seconds>s"`` where
        ``<seconds>`` is a float (e.g. ``"2592000.0s"``).

    Raises:
        ValueError: If ``max_anomalies_per_series`` is not positive.
        NotImplementedError: If a step anomaly is requested and one segment
            is shorter than 20 days.
    """
    # TODO: Clarify UTC only
    if max_anomalies_per_series <= 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("'max_anomalies_per_series' must be a positive integer to divide the time span.")
    if anomalies is None:
        anomalies = ()

    total_seconds = int(pd.Timedelta(interval_str).total_seconds())
    segment_seconds = total_seconds / max_anomalies_per_series

    # Step anomalies are only supported on segments of at least 20 days.
    min_step_segment_seconds = 20 * 24 * 60 * 60
    has_step_anomaly = "step_uv" in anomalies or "step_mv" in anomalies
    if segment_seconds < min_step_segment_seconds and has_step_anomaly:
        raise NotImplementedError("Step anomalies require longer time_span")

    return "{}s".format(segment_seconds)

def _generate_series(self, sampling_interval='15min', sub_time_span='30D',
                     anomalies=None, effects=None, max_anomalies_per_series=2):
    """Build one series by concatenating consecutive sub-series.

    With no anomalies, a single series covering ``self.time_span`` is
    generated. Otherwise the anomaly list is padded with ``None`` (meaning
    "no anomaly in this segment") up to ``max_anomalies_per_series`` entries.
    For each segment of length ``sub_time_span`` one entry is drawn at random
    without replacement, so every segment carries at most one anomaly.

    Args:
        sampling_interval: Sampling step, forwarded to
            ``HumiTempTimeseriesGenerator`` and parsed with ``pd.Timedelta``.
        sub_time_span: Time span of each individual segment.
        anomalies: Anomaly names to distribute over the segments. ``None``
            or an empty list produces a series without anomalies.
        effects: Effects forwarded to every ``generate`` call. ``None``
            means no effects.
        max_anomalies_per_series: Number of segments to concatenate.

    Returns:
        The generated series: either the single ``generate`` result, or the
        ``pd.concat`` of all segments.

    Raises:
        ValueError: If, after a segment is generated, the pool of remaining
            anomalies does not have the expected size (e.g. more anomalies
            were passed than there are segments).
    """
    anomalies = [] if anomalies is None else anomalies
    effects = [] if effects is None else effects

    if len(anomalies) == 0:
        return HumiTempTimeseriesGenerator(
            sampling_interval=sampling_interval,
            time_span=self.time_span,
        ).generate(effects=effects, anomalies=[])

    # Work on a copy so the caller's list is never mutated.
    anomalies = anomalies.copy()
    # Pad with None so that each segment has exactly one entry to draw.
    for _ in range(max_anomalies_per_series - len(anomalies)):
        anomalies.append(None)

    first_anomaly = rnd.sample(anomalies, 1)
    anomalies.remove(first_anomaly[0])
    if first_anomaly[0] is None:
        first_anomaly = []
    series_combined = HumiTempTimeseriesGenerator(
        sampling_interval=sampling_interval,
        time_span=sub_time_span,
    ).generate(effects=effects, anomalies=first_anomaly)

    for i in range(1, max_anomalies_per_series):
        i_anomaly = rnd.sample(anomalies, 1)
        anomalies.remove(i_anomaly[0])
        if i_anomaly[0] is None:
            i_anomaly = []

        # The next segment starts one sampling step after the last timestamp
        # generated so far (passed down to minute resolution only).
        last_time = series_combined.index[-1] + pd.Timedelta(sampling_interval)
        series = HumiTempTimeseriesGenerator(
            sampling_interval=sampling_interval,
            time_span=sub_time_span,
            starting_year=last_time.year,
            starting_month=last_time.month,
            starting_day=last_time.day,
            starting_hour=last_time.hour,
            starting_minute=last_time.minute,
        ).generate(effects=effects, anomalies=i_anomaly)
        series_combined = pd.concat([series_combined, series])

        # Sanity check: exactly one entry must have been consumed per segment.
        if len(anomalies) != max_anomalies_per_series - i - 1:
            raise ValueError("Anomalies list length mismatch.")

    return series_combined


def generate(self, n_series=9, time_span='60D',
effects='default', anomalies='default',
max_anomalies_per_series = 2, anomalies_ratio = 0.5):
max_anomalies_per_series = 1, anomalies_ratio = 0.5,
auto_repeat_anomalies = True):
"""
Generate a synthetic dataset of humidity-temperature time series
with different anomaly configurations.
Expand All @@ -44,101 +97,98 @@ def generate(self, n_series=9, time_span='30D', plot=False,
anomalies (list[str]): Anomalies to apply in each series.
max_anomalies_per_series (int): Max anomalies per series.
anomalies_ratio (float): ratio of series with anomalies w.r.t. series without it in the dataset (0-1 range).
auto_repeat_anomalies (bool): If True, anomalies are automatically reused to fill the requested number per series.
If False, anomalies will only appear as many times as listed in the `anomalies` argument.
Returns:
list: Generated synthetic time series.
"""
random_effects = [] # random_effects (bool, optional): Random effects to apply across series.
n = n_series

# Validate input parameters
if not isinstance(n, int):
raise TypeError(f"'n' must be an integer, got {type(n).__name__}.")
if n <= 0:
raise ValueError("'n' must be a positive integer.")

if max_anomalies_per_series != 2:
raise NotImplementedError("Not yet.")
if anomalies_ratio != 0.5:
raise NotImplementedError("Not yet.")

# Validate and convert parameters to lists
if not isinstance(max_anomalies_per_series, int):
raise TypeError(f"'max_anomalies_per_series' must be an integer, got {type(max_anomalies_per_series).__name__}.")
if max_anomalies_per_series < 0:
raise ValueError("'max_anomalies_per_series' must be a non-negative integer.")
if not isinstance(anomalies_ratio, (int, float)):
raise TypeError(f"'anomalies_ratio' must be a float, got {type(anomalies_ratio).__name__}.")
if not (0 <= anomalies_ratio <= 1):
raise ValueError("'anomalies_ratio' must be between 0 and 1.")
if not isinstance(auto_repeat_anomalies, bool):
raise TypeError(f"'auto_repeat_anomalies' must be a boolean, got {type(auto_repeat_anomalies).__name__}.")

# Validate list parameters
effects = self.__check_list(effects, "effects")
random_effects = self.__check_list(random_effects, "random_effects")
anomalies = self.__check_list(anomalies, "anomalies")

number_of_anomalies = len(anomalies)

if number_of_anomalies == 0:
logger.info("No anomalies specified; generating dataset without anomalies.")
if number_of_anomalies == 1:
logger.info("Single anomaly specified; generating dataset with 0 or 1 anomaly per series.")
if number_of_anomalies >= 2:
logger.info("Multiple anomalies specified; generating dataset with 0, 1, or 2 anomalies per series.")

if number_of_anomalies == 2:
anomaly1, anomaly2 = anomalies[0], anomalies[1]
base1 = anomaly1.replace('_uv', '').replace('_mv', '')
base2 = anomaly2.replace('_uv', '').replace('_mv', '')
if (base1 == base2 and
((anomaly1.endswith('_uv') and anomaly2.endswith('_mv')) or
(anomaly1.endswith('_mv') and anomaly2.endswith('_uv')))):
raise ValueError(f"Incompatible anomaly pair: {anomalies}. '{anomaly1}' and '{anomaly2}' cannot be used together.")

logger.info("No anomalies specified; generating dataset without anomalies. \n " \
"set max_anomalies_per_series to 0.")
max_anomalies_per_series = 0
sub_time_span = time_span
if number_of_anomalies > 0:
logger.info("Generating datest with max {} anomalies per series and " \
"with a {} % of series with anomalies.".format(max_anomalies_per_series, anomalies_ratio * 100))
if not auto_repeat_anomalies:
max_anomalies_per_series = min(max_anomalies_per_series, number_of_anomalies)
sub_time_span = self._divide_time_interval(time_span, max_anomalies_per_series,anomalies=anomalies)

if "clouds" in anomalies:
if "clouds" not in effects:
raise ValueError("Cannot use 'clouds' anomaly without including 'clouds' effect.")
raise ValueError("Cannot use 'clouds' anomaly without including 'clouds' effect.")

dataset = []
self._current_time_span = time_span or self.time_span

try:
generator = HumiTempTimeseriesGenerator(
temperature=self.temperature,
humidity=self.humidity,
sampling_interval=self.sampling_interval,
time_span=self._current_time_span
)
except Exception as e:
raise RuntimeError(f"Error initializing HumiTempTimeseriesGenerator") from e


self.time_span = time_span

accumulator=0.0
for i in range(n):
if i % 2 == 1:
accumulator += anomalies_ratio
if accumulator < 1.0:
anomalies_for_group = []
else:
accumulator -= 1.0
if number_of_anomalies == 0:
anomalies_for_group = []
elif number_of_anomalies == 1:
anomalies_for_group = rnd.sample(anomalies, 1)
else: # number_of_anomalies >= 2
if i % 4 == 0:
anomalies_for_group = rnd.sample(anomalies, 1)
else:
number_of_anomalies = rnd.randint(1, max_anomalies_per_series)
if auto_repeat_anomalies:
anomalies_for_group = rnd.choices(anomalies, k=number_of_anomalies)
else:
anomalies_for_group = rnd.sample(anomalies, 2)
anomalies_for_group = rnd.sample(anomalies, number_of_anomalies)

random_applied_effects = rnd.sample(random_effects, rnd.randint(0, len(random_effects)))
applied_effects = list(set(effects + random_applied_effects))

try:
series = generator.generate(effects=applied_effects or [],
anomalies=anomalies_for_group or [],
plot=plot, generate_csv=False)
except Exception as Error:
logger.warning(f"Error generating dataset with anomalies {anomalies_for_group}: Retrying.")
# Try other combinations of anomalies
for combo in rnd.sample(list(itertools.combinations(anomalies, 2)), len(anomalies)):
try:
series = generator.generate(effects=applied_effects or [],
anomalies=list(combo),
plot=True, generate_csv=False)
break # Exit loop if successful
except Exception as e:
logger.warning(f"Failed with combination {combo}: {e}")
logger.info(f"Generated dataset {len(dataset)+1} with effects: {applied_effects}")
series = self._generate_series(sampling_interval=self.sampling_interval,
sub_time_span=sub_time_span,
anomalies=anomalies_for_group,
effects=applied_effects,
max_anomalies_per_series=max_anomalies_per_series)
except Exception as e:
logger.error(f"Error generating series {i+1}: {e}")
continue
logger.info(f"Generated dataset {len(dataset)+1} with effects: {applied_effects} and anomalies: {anomalies_for_group} ")
dataset.append(series)

return dataset

@staticmethod
def plot_dataset(dataset):
    """
    Plot a whole dataset, one `_plot_func` call per item.

    Args:
        dataset: Iterable of series, e.g. the list returned by `generate`.
            Every item is handed to `_plot_func` with
            `auto_search_anomalies_label=True`.
    """
    for series in dataset:
        _plot_func(series, auto_search_anomalies_label=True)


def _expected_points(self):
    """Return the number of samples expected in one generated series.

    Computed as ``self.time_span`` divided by ``self.sampling_interval``
    (both parsed with ``pd.Timedelta``), truncated to an integer.

    Returns:
        int: Expected number of points per series.
    """
    # Read only ``self.time_span``: the earlier assignment from
    # ``self._current_time_span`` was immediately overwritten and could raise
    # AttributeError when that attribute is not set.
    obs_window = pd.Timedelta(self.time_span)
    samp_interval = pd.Timedelta(self.sampling_interval)
    return int(obs_window / samp_interval)
50 changes: 29 additions & 21 deletions ats/tests/test_dataset_generators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
import pandas as pd
from unittest.mock import patch

from ..dataset_generators import HumiTempDatasetGenerator

Expand All @@ -14,8 +15,8 @@ def test_generate(self):
test_dataset = generator.generate(
n_series=12,
time_span='3D',
effects=['noise'],
anomalies=['spike_uv', 'step_uv']
effects=['noise','clouds'],
anomalies=['spike_uv', 'clouds']
)
expected_points = generator._expected_points()
self.assertEqual(len(test_dataset), 12)
Expand Down Expand Up @@ -45,25 +46,24 @@ def test_generate_errors(self):
generator.generate(effects='noise',anomalies=[])
with self.assertRaises(TypeError):
generator.generate(effects=456,anomalies=[])
with self.assertRaises(ValueError):
generator.generate(effects=[],anomalies=['spike_uv', 'spike_mv'])
generator.generate(effects=[],anomalies=['spike_uv', 'spike_mv'])
with self.assertRaises(ValueError):
generator.generate(effects=[],anomalies=['clouds'])
generator.generate(effects=['clouds'],anomalies=['clouds','spike_mv']) # Should not raise

def test_generate_random_effects(self):
generator = HumiTempDatasetGenerator()
test_dataset = generator.generate(
n_series=9,
time_span='4D',
#def test_generate_random_effects(self):
# generator = HumiTempDatasetGenerator()
# test_dataset = generator.generate(
# n_series=9,
# time_span='90D',
#random_effects=['clouds'],
effects=['noise', 'seasons'],
anomalies=['spike_uv','step_uv']
)
self.assertEqual(len(test_dataset), 9)
for i, series in enumerate(test_dataset, start=1):
self.assertIsNotNone(series, f"Series {i} is None")
self.assertTrue(len(series) > 0, f"Series {i} is empty")
# effects=['noise', 'seasons'],
# anomalies=['spike_uv','step_uv']
# )
# self.assertEqual(len(test_dataset), 9)
#for i, series in enumerate(test_dataset, start=1):
# self.assertIsNotNone(series, f"Series {i} is None")
# self.assertTrue(len(series) > 0, f"Series {i} is empty")

def test_no_anomalies(self):
generator = HumiTempDatasetGenerator()
Expand Down Expand Up @@ -105,16 +105,24 @@ def test_multiple_anomalies(self):
generator = HumiTempDatasetGenerator()
test_dataset = generator.generate(
n_series=8,
time_span='2D',
time_span='3D',
effects=['noise'],
anomalies=['spike_uv', 'step_mv']
anomalies=['spike_uv', 'spike_mv']
)
self.assertEqual(len(test_dataset), 8)
for i, series in enumerate(test_dataset):
with self.subTest(dataset=i):
self.assertIn('temperature', series.columns)
self.assertIn('humidity', series.columns)
self.assertEqual(len(series), generator._expected_points())
# Verify anomaly labels are either 0, 1, or 2
if 'anomaly' in series.columns:
self.assertTrue(series['anomaly'].isin([0, 1, 2]).all())
# Verify anomaly labels are either 0, 1, or 2
if 'anomaly' in series.columns:
self.assertTrue(series['anomaly'].isin([0, 1, 2]).all())

@patch("matplotlib.pyplot.show")
def test_plot_dataset(self, mock_show):
generator = HumiTempDatasetGenerator()
test_dataset = generator.generate(n_series=3, time_span='1D',
effects=['noise'], anomalies=['spike_uv'])
generator.plot_dataset(test_dataset)
self.assertEqual(mock_show.call_count, 3)
Loading