Skip to content

Commit

Permalink
make graph more meaningful (#1977)
Browse files Browse the repository at this point in the history
* make graph more meaningful

if a search with multiple usernames is launched, it creates an additional site node where they both are found. 
advantages:
- better recognition, that users have a connection with each other
- better detection of false positives when launching a search with two fake usernames (site node = definite false positive)

* fix Graph linking report.py
  • Loading branch information
overcuriousity authored Dec 17, 2024
1 parent c2e3e96 commit 36ce285
Showing 1 changed file with 85 additions and 74 deletions.
159 changes: 85 additions & 74 deletions maigret/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,118 +98,129 @@ class MaigretGraph:
def __init__(self, graph):
self.G = graph

def add_node(self, key, value):
def add_node(self, key, value, color=None):
node_name = f'{key}: {value}'

params = self.other_params
params = dict(self.other_params)
if key in SUPPORTED_IDS:
params = self.username_params
params = dict(self.username_params)
elif value.startswith('http'):
params = self.site_params

self.G.add_node(node_name, title=node_name, **params)

if value != value.lower():
normalized_node_name = self.add_node(key, value.lower())
self.link(node_name, normalized_node_name)
params = dict(self.site_params)

params['title'] = node_name
if color:
params['color'] = color

self.G.add_node(node_name, **params)
return node_name

def link(self, node1_name, node2_name):
self.G.add_edge(node1_name, node2_name, weight=2)


def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
# moved here to speed up the launch of Maigret
import networkx as nx

G = nx.Graph()
graph = MaigretGraph(G)

for username, id_type, results in username_results:
username_node_name = graph.add_node(id_type, username)
base_site_nodes = {}
site_account_nodes = {}
processed_values = {} # Track processed values to avoid duplicates

for website_name in results:
dictionary = results[website_name]
# TODO: fix no site data issue
if not dictionary:
continue
for username, id_type, results in username_results:
# Add username node, using normalized version directly if different
norm_username = username.lower()
username_node_name = graph.add_node(id_type, norm_username)

if dictionary.get("is_similar"):
for website_name, dictionary in results.items():
if not dictionary or dictionary.get("is_similar"):
continue

status = dictionary.get("status")
if not status: # FIXME: currently in case of timeout
if not status or status.status != MaigretCheckStatus.CLAIMED:
continue

if dictionary["status"].status != MaigretCheckStatus.CLAIMED:
continue
# base site node
site_base_url = website_name
if site_base_url not in base_site_nodes:
base_site_nodes[site_base_url] = graph.add_node('site', site_base_url, color='#28a745') # Green color

site_fallback_name = dictionary.get(
'url_user', f'{website_name}: {username.lower()}'
)
# site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
site_node_name = graph.add_node('site', site_fallback_name)
graph.link(username_node_name, site_node_name)
site_base_node_name = base_site_nodes[site_base_url]

# account node
account_url = dictionary.get('url_user', f'{site_base_url}/{norm_username}')
account_node_id = f"{site_base_url}: {account_url}"
if account_node_id not in site_account_nodes:
site_account_nodes[account_node_id] = graph.add_node('account', account_url)

account_node_name = site_account_nodes[account_node_id]

# link username → account → site
graph.link(username_node_name, account_node_name)
graph.link(account_node_name, site_base_node_name)

def process_ids(parent_node, ids):
for k, v in ids.items():
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
continue
if k in 'image':
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at') or k in 'image':
continue

v_data = v
if v.startswith('['):
try:
v_data = ast.literal_eval(v)
except Exception as e:
logging.error(e)

# value is a list
if isinstance(v_data, list):
list_node_name = graph.add_node(k, site_fallback_name)
for vv in v_data:
data_node_name = graph.add_node(vv, site_fallback_name)
graph.link(list_node_name, data_node_name)

add_ids = {
a: b for b, a in db.extract_ids_from_url(vv).items()
}
if add_ids:
process_ids(data_node_name, add_ids)
# Normalize value if string
norm_v = v.lower() if isinstance(v, str) else v
value_key = f"{k}:{norm_v}"

if value_key in processed_values:
ids_data_name = processed_values[value_key]
else:
# value is just a string
# ids_data_name = f'{k}: {v}'
# if ids_data_name == parent_node:
# continue

ids_data_name = graph.add_node(k, v)
# G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
graph.link(parent_node, ids_data_name)

# check for username
if 'username' in k or k in SUPPORTED_IDS:
new_username_node_name = graph.add_node('username', v)
graph.link(ids_data_name, new_username_node_name)
v_data = v
if isinstance(v, str) and v.startswith('['):
try:
v_data = ast.literal_eval(v)
except Exception as e:
logging.error(e)
continue

if isinstance(v_data, list):
list_node_name = graph.add_node(k, site_base_url)
processed_values[value_key] = list_node_name
for vv in v_data:
data_node_name = graph.add_node(vv, site_base_url)
graph.link(list_node_name, data_node_name)

add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
if add_ids:
process_ids(data_node_name, add_ids)
ids_data_name = list_node_name
else:
ids_data_name = graph.add_node(k, norm_v)
processed_values[value_key] = ids_data_name

if 'username' in k or k in SUPPORTED_IDS:
new_username_key = f"username:{norm_v}"
if new_username_key not in processed_values:
new_username_node_name = graph.add_node('username', norm_v)
processed_values[new_username_key] = new_username_node_name
graph.link(ids_data_name, new_username_node_name)

add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
if add_ids:
process_ids(ids_data_name, add_ids)

add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
if add_ids:
process_ids(ids_data_name, add_ids)
graph.link(parent_node, ids_data_name)

if status.ids_data:
process_ids(site_node_name, status.ids_data)
process_ids(account_node_name, status.ids_data)

nodes_to_remove = []
for node in G.nodes:
if len(str(node)) > 100:
nodes_to_remove.append(node)
# Remove overly long nodes
nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
G.remove_nodes_from(nodes_to_remove)

[G.remove_node(node) for node in nodes_to_remove]
# Remove site nodes with only one connection
single_degree_sites = [n for n, deg in G.degree() if n.startswith("site:") and deg <= 1]
G.remove_nodes_from(single_degree_sites)

# moved here to speed up the launch of Maigret
# Generate interactive visualization
from pyvis.network import Network

nt = Network(notebook=True, height="750px", width="100%")
nt.from_nx(G)
nt.show(filename)
Expand Down

0 comments on commit 36ce285

Please sign in to comment.