Commit 5f6c48c

Merge pull request #74 from redBorder/feature/ip_identifier
PR-63: Outliers identifier
2 parents e49f5eb + cd864ff commit 5f6c48c

File tree

3 files changed: +298 additions, −2 deletions
resources/src/ai/outliers_identifier.py

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
# Copyright (C) 2024 Eneo Tecnologia S.L.
#
# Authors:
# Miguel Álvarez Adsuara <malvarez@redborder.com>
#
# This program is free software: you can redistribute it and/or modify it under the terms of the
# GNU Affero General Public License as published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.

import json
import pandas as pd
from resources.src.logger import logger
from sklearn.ensemble import IsolationForest

class OutlierIdentifier:
    def __init__(self):
        self.df = None
        self.model = None

    def prepare_data(self, all_ips_data):
        """
        Prepare the data by flattening the input data, extracting relevant features,
        and computing rolling statistics.

        Args:
            all_ips_data (dict): Dictionary containing time-series data for each IP.
        """
        flattened_data = []
        for ip, ip_data in all_ips_data.items():
            for entry in ip_data:
                flattened_data.append({
                    "ip": ip,
                    "timestamp": entry.get("timestamp"),
                    "bytes": entry.get("result", {}).get("bytes", 0),
                })

        self.df = pd.DataFrame(flattened_data)
        self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
        self.df['hour'] = self.df['timestamp'].dt.hour
        self.df['minute'] = self.df['timestamp'].dt.minute
        self.df['day'] = self.df['timestamp'].dt.day
        self.df['dayofweek'] = self.df['timestamp'].dt.dayofweek
        self.df['dayofyear'] = self.df['timestamp'].dt.dayofyear

        self.df['rolling_mean'] = self.df['bytes'].rolling(window=5, min_periods=1).mean()
        self.df['rolling_std'] = self.df['bytes'].rolling(window=5, min_periods=1).std()

        self.df['rolling_mean'] = self.df['rolling_mean'].fillna(0)
        self.df['rolling_std'] = self.df['rolling_std'].fillna(0)

        self.df['low_traffic'] = self.df['bytes'] == 0

    def train_model(self, X_train):
        """
        Train the Isolation Forest model on the provided training data.

        Args:
            X_train (DataFrame): The training set features.
        """
        self.model = IsolationForest(contamination=0.05, random_state=42)
        self.model.fit(X_train)

    def identify_implicated_ips(self, outliers):
        """
        Identify IPs that contributed to the outlier events.

        Args:
            outliers (list): A list of outlier events with timestamps and expected values.

        Returns:
            dict: A dictionary with implicated IPs for each outlier event.
        """
        self.df['outlier'] = self.model.predict(self.df[['hour', 'minute', 'day', 'dayofweek', 'dayofyear', 'rolling_mean', 'rolling_std', 'low_traffic']])
        self.df['outlier'] = self.df['outlier'].apply(lambda x: 'anomaly' if x == -1 else 'normal')

        implicated_ips = {"ips": []}
        for outlier in outliers:
            timestamp = outlier["timestamp"]
            outlier_data = self.df[self.df['timestamp'] == timestamp]

            implicated_ips["ips"].append({
                "caused_by": list(outlier_data[outlier_data['outlier'] == 'anomaly']['ip'])
            })

        return implicated_ips

    def execute(self, outliers, all_ips_data):
        """
        Execute the full pipeline for detecting outliers and identifying implicated IPs.

        Args:
            outliers (list): A list of outlier events.
            all_ips_data (dict): Dictionary containing time-series data for each IP.

        Returns:
            json: A JSON string with the implicated IPs and outlier information.
        """
        self.prepare_data(all_ips_data)
        self.train_model(self.df[['hour', 'minute', 'day', 'dayofweek', 'dayofyear', 'rolling_mean', 'rolling_std', 'low_traffic']])

        implicated_ips = self.identify_implicated_ips(outliers)

        logger.logger.error(implicated_ips)

        return json.dumps(implicated_ips) if implicated_ips else {"ips": []}

    def train_and_execute_model(self, outliers, all_ips_data):
        """
        Wrapper function to handle errors during model training and execution.

        Args:
            outliers (list): A list of outliers to process.
            all_ips_data (dict): Dictionary of IP data.

        Returns:
            json: A JSON response with the result or error message.
        """
        try:
            return self.execute(outliers, all_ips_data)
        except Exception as e:
            logger.logger.error("Could not execute anomaly detection")
            return self.return_error(e)

    def return_error(self, error="error"):
        """
        Return a JSON formatted error message.

        Args:
            error (str): The error message to return.

        Returns:
            dict: A dictionary containing the error status and message.
        """
        return { "status": "error", "msg": error }
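
For reference, a minimal sketch of how this class can be driven directly. The data shapes mirror the tests added by this commit; the IP, timestamps, and byte counts are illustrative only.

import json

from resources.src.ai.outliers_identifier import OutlierIdentifier

# Illustrative input: per-IP time series plus the outlier timestamps to explain.
all_ips_data = {
    "192.168.1.1": [
        {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 100}},
        {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 1000}},  # unusually high traffic
        {"timestamp": "2024-11-14T12:10:00", "result": {"bytes": 100}},
    ]
}
outliers = [{"timestamp": "2024-11-14T12:05:00"}]

identifier = OutlierIdentifier()
# On success this returns a JSON string such as '{"ips": [{"caused_by": [...]}]}';
# on failure it returns the error dict produced by return_error().
result = identifier.train_and_execute_model(outliers, all_ips_data)
print(json.loads(result) if isinstance(result, str) else result)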

resources/src/server/rest.py

Lines changed: 30 additions & 2 deletions
@@ -25,12 +25,11 @@
 from flask import Flask, jsonify, request

 from resources.src.redborder.s3 import S3
-from resources.src.ai import outliers, shallow_outliers
+from resources.src.ai import outliers, shallow_outliers, outliers_identifier
 from resources.src.druid import client, query_builder
 from resources.src.logger import logger
 from resources.src.config import configmanager

-
 '''
 Init local variables
 '''
@@ -63,11 +62,13 @@ def __init__(self):
         self.start_s3_sync_thread()
         self.app = Flask(__name__)
         self.app.add_url_rule('/api/v1/outliers', view_func=self.calculate, methods=['POST'])
+        self.app.add_url_rule('/api/v1/ip_identifier', view_func=self.identify_ip, methods=['POST'])
         self.exit_code = 0
         self.shallow = shallow_outliers.ShallowOutliers(
             sensitivity = config.get("ShallowOutliers", "sensitivity"),
             contamination = config.get("ShallowOutliers", "contamination")
         )
+        self.identifier = outliers_identifier.OutlierIdentifier()
         self.ai_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "ai")
         self.deep_models={}

@@ -116,6 +117,33 @@ def calculate(self):
         logger.logger.info("Starting outliers execution")
         return self.execute_model(data, config.get("Outliers","metric"), model)

+    def identify_ip(self):
+        """
+        Process the incoming request to identify implicated IPs based on outlier data.
+
+        Returns:
+            Response: A JSON response with implicated IPs or an error message.
+        """
+        try:
+            payload = json.loads(request.form.get('payload', '{}'))
+
+            outliers = payload.get('outliers', [])
+            all_ips_data = payload.get('all_ips_data', {})
+
+            if not isinstance(outliers, list) or not isinstance(all_ips_data, dict):
+                return jsonify({"error": "Invalid data format"}), 400
+
+            result = self.identifier.train_and_execute_model(outliers, all_ips_data)
+
+            logger.logger.error(result)
+
+            return jsonify(result), 200
+
+        except Exception as e:
+            logger.logger.error(f"Exception in identify_ip: {e}")
+            return jsonify({"error": "An internal error has occurred!"}), 500
+
     def decode_b64_json(self, b64_json):
         """
         Decode a base64 json into a python dictionary.
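
The new route expects the request body as form data, with a single 'payload' field holding the JSON document (that is what request.form.get('payload', '{}') reads above). A minimal client-side sketch follows; the host and port are hypothetical, since they come from the deployment configuration rather than this diff, and any HTTP client would work.

import json
import requests  # assumption: requests is available; any HTTP client would do

payload = {
    "outliers": [{"timestamp": "2024-11-14T12:05:00"}],
    "all_ips_data": {
        "192.168.1.1": [
            {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 100}},
            {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 1000}},
            {"timestamp": "2024-11-14T12:10:00", "result": {"bytes": 100}},
        ]
    },
}

# Send the JSON as a form field named 'payload', not as a JSON request body,
# to match how identify_ip() parses the request.
response = requests.post(
    "http://localhost:8080/api/v1/ip_identifier",  # hypothetical host/port
    data={"payload": json.dumps(payload)},
)
print(response.status_code, response.json())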

resources/tests/test_ip_identifier.py

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
# Copyright (C) 2024 Eneo Tecnologia S.L.
#
# Authors:
# Miguel Álvarez Adsuara <malvarez@redborder.com>
#
# This program is free software: you can redistribute it and/or modify it under the terms of the
# GNU Affero General Public License as published by the Free Software Foundation, either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.

import json
import unittest
import pandas as pd
from resources.src.ai.outliers_identifier import OutlierIdentifier

class TestOutlierIdentifier(unittest.TestCase):

    def setUp(self):
        self.identifier = OutlierIdentifier()

    def test_prepare_data_valid_input(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 500}},
                {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 300}}
            ]
        }
        self.identifier.prepare_data(data)
        self.assertIsInstance(self.identifier.df, pd.DataFrame)
        self.assertIn('timestamp', self.identifier.df.columns)
        self.assertIn('bytes', self.identifier.df.columns)
        self.assertEqual(len(self.identifier.df), 2)

    def test_prepare_data_missing_bytes(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {}},
                {"timestamp": "2024-11-14T12:05:00", "result": {}}
            ]
        }
        self.identifier.prepare_data(data)
        self.assertEqual(self.identifier.df['bytes'].sum(), 0)

    def test_prepare_data_empty_input(self):
        data = {}
        self.identifier.prepare_data(data)
        self.assertTrue(self.identifier.df.empty)

    def test_train_model_valid_data(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 500}},
                {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 300}},
                {"timestamp": "2024-11-14T12:10:00", "result": {"bytes": 1000}},
                {"timestamp": "2024-11-14T12:15:00", "result": {"bytes": 700}},
                {"timestamp": "2024-11-14T12:20:00", "result": {"bytes": 0}}
            ]
        }
        self.identifier.prepare_data(data)
        try:
            self.identifier.train_model(self.identifier.df[['hour', 'minute', 'day', 'dayofweek', 'dayofyear', 'rolling_mean', 'rolling_std', 'low_traffic']])
        except Exception as e:
            self.fail(f"Training failed with exception: {e}")

    def test_identify_implicated_ips_no_outliers(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 100}},
                {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 100}}
            ]
        }
        outliers = []
        self.identifier.prepare_data(data)
        self.identifier.train_model(self.identifier.df[['hour', 'minute', 'day', 'dayofweek', 'dayofyear', 'rolling_mean', 'rolling_std', 'low_traffic']])
        result = self.identifier.identify_implicated_ips(outliers)
        self.assertEqual(result, {"ips": []})

    def test_identify_implicated_ips_with_outliers(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 100}},
                {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 1000}},  # Anomalous traffic
                {"timestamp": "2024-11-14T12:10:00", "result": {"bytes": 100}},
            ]
        }
        outliers = [{"timestamp": "2024-11-14T12:05:00"}]
        self.identifier.prepare_data(data)
        self.identifier.train_model(self.identifier.df[['hour', 'minute', 'day', 'dayofweek', 'dayofyear', 'rolling_mean', 'rolling_std', 'low_traffic']])
        result = self.identifier.identify_implicated_ips(outliers)
        self.assertIn("192.168.1.1", result["ips"][0]["caused_by"])

    def test_execute_with_valid_input(self):
        data = {
            "192.168.1.1": [
                {"timestamp": "2024-11-14T12:00:00", "result": {"bytes": 500}},
                {"timestamp": "2024-11-14T12:05:00", "result": {"bytes": 300}},
                {"timestamp": "2024-11-14T12:10:00", "result": {"bytes": 1000}}
            ]
        }
        outliers = [{"timestamp": "2024-11-14T12:10:00"}]
        result = self.identifier.execute(outliers, data)
        self.assertIsInstance(result, str)
        parsed_result = json.loads(result)
        self.assertIn("ips", parsed_result)
        self.assertEqual(len(parsed_result["ips"]), 1)

    def test_train_and_execute_model_error_handling(self):
        data = None  # Invalid data
        outliers = [{"timestamp": "2024-11-14T12:10:00"}]
        result = self.identifier.train_and_execute_model(outliers, data)
        self.assertIn("status", result)
        self.assertEqual(result["status"], "error")

    def test_return_error(self):
        error_message = "Test error"
        result = self.identifier.return_error(error_message)
        self.assertEqual(result["status"], "error")
        self.assertEqual(result["msg"], error_message)

if __name__ == "__main__":
    unittest.main()
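
The new test module can be run on its own with unittest; this assumes the repository root is the working directory so that the resources.* imports resolve:

python -m unittest resources.tests.test_ip_identifier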

Comments (0)