diff --git a/resources/src/ai/shallow_outliers.py b/resources/src/ai/shallow_outliers.py index 370804b..550d7ce 100644 --- a/resources/src/ai/shallow_outliers.py +++ b/resources/src/ai/shallow_outliers.py @@ -67,16 +67,16 @@ def predict(self, arr): smooth_arr = np.convolve(padded_arr, kernel, mode='valid') return smooth_arr - def get_outliers(self, arr, smoothed_arr): + def get_outliers(self, arr, smoothed_arr, other=None): """ Given an array of data points and an aproximation of it, return a boolean array with the same shape as the original array which is True when the data point is an outlier and False otherwise. The method used for outlier detection is an isolation forest, which will look for - the 0.3% most isolated points when taking into account the original value, the - smoothed valued, the diference between them (error) and the squared diference - between them. + the 1% most isolated points when taking into account the original value, the + smoothed valued, the absolute diference between them (MAE) and the sign of the + difference between them. Args: arr (numpy.ndarray): 1D numpy array where the outliers shall be detected. @@ -87,13 +87,38 @@ def get_outliers(self, arr, smoothed_arr): numpy.ndarray: 1D numpy array with the smoothed data. """ error = arr-smoothed_arr - loss = error**2 - data = np.stack((arr,smoothed_arr,error,loss), axis = 1) - model = IsolationForest(n_estimators=100, contamination=0.003) + sign = np.sign(error) + data = np.stack((smoothed_arr, np.abs(error), sign), axis=1) + if other is not None: + data = np.concatenate([data, other], axis=1) + model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42) model.fit(data) + model.offset_=-0.05+0.95*model.offset_ outliers = model.predict(data)==-1 return outliers + def encode_timestamp(self, timestamp): + """ + Takes a pandas Series of timestamps and returns a numpy array with a sine encoding for the + hour of day and a cosine encoding for the day of the week. This encoding helps the model to + learn periodic patterns in the data while maintaining simplicity. + + Parameters: + timestamps (pd.Series): A Pandas Series of timestamps. + + Returns: + pd.DataFrame: A DataFrame with sine-cosine encodings for daily and weekly periods. + """ + if not isinstance(timestamp, pd.Series): + raise ValueError("Input must be a Pandas Series") + timestamp = pd.to_datetime(timestamp) + hour_of_day = timestamp.dt.hour + timestamp.dt.minute/60 + day_of_week = timestamp.dt.dayofweek + hour_of_day/24 + daily_sin = np.sin(2*np.pi*hour_of_day/24) + weekly_cos = np.cos(2*np.pi*day_of_week/7) + encoded = np.stack((daily_sin, weekly_cos), axis=1) + return encoded + def compute_json(self, raw_json): """ Main method used for anomaly detection. @@ -111,7 +136,8 @@ def compute_json(self, raw_json): data = pd.json_normalize(raw_json) arr = data.iloc[:, 1].values smoothed_arr = self.predict(arr) - outliers = self.get_outliers(arr, smoothed_arr) + encoded_timestamp = self.encode_timestamp(data["timestamp"]) + outliers = self.get_outliers(arr, smoothed_arr, other=encoded_timestamp) data["smooth"] = smoothed_arr predicted = data[["timestamp","smooth"]].rename(columns={"smooth":"forecast"}) anomalies = data[["timestamp","smooth"]].rename(columns={"smooth":"expected"}).loc[outliers]