Skip to content

Commit

Permalink
plgn-380 improve deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
llaszuk-r7 committed Oct 31, 2023
1 parent 90bf23a commit 8770a47
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 13 deletions.
6 changes: 3 additions & 3 deletions plugins/salesforce/.CHECKSUM
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"spec": "0ec9ed127b0d93e5957193155cc27e3f",
"manifest": "0727a55b0072f452c180829a4cddd8e7",
"setup": "89fe33ae3cb64e2d51cd30705310194c",
"spec": "008516a68718d4a9b57835c57c3a132a",
"manifest": "f9b4006cd9fa90077c2395ea32bdc5fb",
"setup": "fab40f6c1ff45349e729be467280922f",
"schemas": [
{
"identifier": "advanced_search/schema.py",
Expand Down
2 changes: 1 addition & 1 deletion plugins/salesforce/bin/komand_salesforce
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from sys import argv

Name = "Salesforce"
Vendor = "rapid7"
Version = "2.1.2"
Version = "2.1.3"
Description = "The Salesforce plugin allows you to search, update, and manage salesforce records"


Expand Down
1 change: 1 addition & 0 deletions plugins/salesforce/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ _This plugin does not contain any troubleshooting information._

# Version History

* 2.1.3 - Task Monitor Users: improve deduplication logic on user login history
* 2.1.2 - Task Monitor Users: normalisation for date in state, handle backwards compatibility
* 2.1.1 - Task Monitor Users: query improvement on updated users | Add extra logs on timestamp | Add cutoff time limit for 24 hours
* 2.1.0 - Implemented token auto-refresh on expiration for continuous sessions | Task Monitor Users: add flag `remove_duplicates` for duplicated events | Task Monitor Users: removed formatting of task output and cleaning null
Expand Down
42 changes: 35 additions & 7 deletions plugins/salesforce/komand_salesforce/tasks/monitor_users/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,16 +159,17 @@ def run(self, params={}, state={}): # noqa: C901
)
users_login = response.get("records", [])
user_login_next_page_id = response.get("next_page_id")

if remove_duplicates is True:
users_login = self.remove_duplicates_user_login_history(users_login)

if user_login_next_page_id:
state[self.USER_LOGIN_NEXT_PAGE_ID] = user_login_next_page_id
has_more_pages = True

self.logger.info(f"{len(users_login)} users login history added to output")
records.extend(self.add_data_type_field(users_login, "User Login"))

if remove_duplicates is True:
records = self.remove_duplicates(records)

return records, state, has_more_pages, 200, None
except ApiException as error:
return [], state, False, error.status_code, error
Expand Down Expand Up @@ -205,24 +206,51 @@ def _get_recent_timestamp(self, state: dict, fallback_timestamp: datetime, key:
stored_timestamp = self.convert_to_datetime(state.get(key, fallback_timestamp))
return max(stored_timestamp, fallback_timestamp)

def remove_duplicates_user_login_history(self, records: list) -> list:
    """
    Remove duplicate login-history entries, keeping the first occurrence of each.

    Two records are treated as duplicates when ``self._get_non_time_fields_hash``
    returns the same value for both. The relative order of the surviving
    records is the same as in the input.

    Args:
        records (list): A list containing the records to be de-duplicated.
    Returns:
        list: A list containing only the unique records from the input list.
    """
    unique_records = []
    # Set membership is O(1), so the whole pass stays linear in len(records).
    seen_hashes = set()

    for record in records:
        hash_record = self._get_non_time_fields_hash(record)
        if hash_record not in seen_hashes:
            seen_hashes.add(hash_record)
            unique_records.append(record)

    removed_count = len(records) - len(unique_records)
    if removed_count:
        # Only log when something was actually dropped.
        self.logger.info(f"Removed {removed_count} duplicate(s) from a total of {len(records)} records.")
    return unique_records

def _get_non_time_fields_hash(self, record: dict) -> int:
    """
    Calculate a hash based on the non-time fields of a record.

    Each field is read with ``record.get(field, "")`` and converted with
    ``str``; the resulting values are hashed together as a tuple so that the
    boundary between adjacent fields is preserved (plain concatenation would
    make ``"ab" + "c"`` and ``"a" + "bc"`` indistinguishable).

    The value comes from the built-in ``hash`` and is therefore only meant for
    comparisons within a single interpreter process.

    Args:
        record (dict): A dictionary containing the record data with fields to be used for hash calculation.
    Returns:
        int: A hash value representing the non-time fields of the record.
    """
    # NOTE(review): "userId" is the only lower-camel key in this list; confirm it
    # matches the key casing actually present in the login-history records.
    non_time_fields = (
        "userId",
        "LoginType",
        "LoginUrl",
        "SourceIp",
        "Status",
        "Application",
        "Browser",
    )
    return hash(tuple(str(record.get(field, "")) for field in non_time_fields))

@staticmethod
def get_current_time() -> datetime:
    """
    Return the current time as a timezone-aware datetime in UTC.
    Returns:
        datetime: The current moment with ``tzinfo`` set to ``timezone.utc``.
    """
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now
Expand Down
2 changes: 1 addition & 1 deletion plugins/salesforce/plugin.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ products: [insightconnect]
name: salesforce
title: Salesforce
description: The Salesforce plugin allows you to search, update, and manage salesforce records
version: 2.1.2
version: 2.1.3
connection_version: 2
vendor: rapid7
support: community
Expand Down
2 changes: 1 addition & 1 deletion plugins/salesforce/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


setup(name="salesforce-rapid7-plugin",
version="2.1.2",
version="2.1.3",
description="The Salesforce plugin allows you to search, update, and manage salesforce records",
author="rapid7",
author_email="",
Expand Down

0 comments on commit 8770a47

Please sign in to comment.