
feat: computing neo4j analytics for just the only latest date! #34

Merged
merged 5 commits on Sep 23, 2024
Changes from 3 commits
@@ -42,13 +42,16 @@ def __init__(
self.projection_utils = ProjectionUtils(self.platform_id, self.graph_schema)

def compute_stats(self, from_start: bool) -> None:
"""
from_start is disabled; we always compute only for the latest date
"""
# possible dates to do the computations
possible_dates = self.projection_utils.get_dates()

# if we didn't want to compute from the day start
if not from_start:
computed_dates = self.get_computed_dates()
possible_dates = possible_dates - computed_dates
# if not from_start:
# computed_dates = self.get_computed_dates()
# possible_dates = possible_dates - computed_dates

for date in possible_dates:
try:
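With the subtraction of computed_dates disabled and get_dates() trimmed to return only the latest date (see the ProjectionUtils change below), the loop now runs at most once. A hedged usage sketch — the analyzer instance here is hypothetical, only the signature above comes from the diff:

# Sketch: both calls now behave identically and process only the latest date.
analyzer.compute_stats(from_start=True)
analyzer.compute_stats(from_start=False)  # no longer subtracts already-computed dates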
44 changes: 24 additions & 20 deletions tc_analyzer_lib/algorithms/neo4j_analysis/centrality.py
@@ -31,6 +31,8 @@ def compute_degree_centerality(
the computed_dates will be based on the
network decentrality metric computations

NOTE: `from_start` is disabled and we're always computing for the latest date available

Parameters:
------------
direction : str
@@ -68,30 +70,32 @@ def compute_degree_centerality(
node = self.graph_schema.user_label
weighted = True if "weighted" not in kwargs.keys() else kwargs["weighted"]
normalize = False if "normalize" not in kwargs.keys() else kwargs["normalize"]
preserve_parallel = (
True
if "preserve_parallel" not in kwargs.keys()
else kwargs["preserve_parallel"]
)
preserve_parallel = kwargs.get("preserve_parallel", True)

recompute_dates = None
if "recompute_dates" in kwargs:
recompute_dates = kwargs["recompute_dates"]
# recompute_dates = None
# if "recompute_dates" in kwargs:
# recompute_dates = kwargs["recompute_dates"]
Comment on lines +75 to +77

Remove commented-out code

The code for handling recompute_dates has been commented out. If this functionality is no longer needed, it's better to remove the code entirely rather than leaving it commented out. This helps maintain code cleanliness and readability.

If you think this functionality might be needed in the future, remember that version control systems can be used to retrieve old code if necessary.
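Applying the suggestion, the remaining defaults could also collapse to dict.get lookups, mirroring the preserve_parallel change already in this diff (a sketch, not part of the PR):

weighted = kwargs.get("weighted", True)
normalize = kwargs.get("normalize", False)
preserve_parallel = kwargs.get("preserve_parallel", True)
# recompute_dates handling dropped entirely; git history preserves the old code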


if weighted and not preserve_parallel:
logging.warn(
logging.warning(
"""preserver_parallel=False with weighted=True
could produce wrong results!"""
)

interacted_with_label = self.graph_schema.interacted_with_rel
query = """
MATCH () -[r:INTERACTED_WITH {platformId: $platform_id}]-()
WITH max(r.date) as latest_date
"""

# determining one line of the query using the direction variable
interaction = f"[r:{interacted_with_label} {{date: latest_date}}]"
if direction == "in_degree":
query = f"MATCH (a:{node})<-[r:{interacted_with_label}]-(b:{node})"
query += f"MATCH (a:{node})<-{interaction}-(b:{node})"
elif direction == "out_degree":
query = f"MATCH (a:{node})-[r:{interacted_with_label}]->(b:{node})"
query += f"MATCH (a:{node})-{interaction}->(b:{node})"
elif direction == "undirected":
query = f"MATCH (a:{node})-[r:{interacted_with_label}]-(b:{node})"
query += f"MATCH (a:{node})-{interaction}-(b:{node})"

results = self.neo4j_ops.gds.run_cypher(
f"""
@@ -107,14 +111,14 @@ def compute_degree_centerality(
)

dates_to_compute = set(results["date"].value_counts().index)
if not from_start:
projection_utils = ProjectionUtils(self.platform_id, self.graph_schema)

dates_to_compute = self._get_dates_to_compute(
projection_utils, dates_to_compute
)
if recompute_dates is not None:
dates_to_compute = dates_to_compute.union(recompute_dates)
# if not from_start:
# projection_utils = ProjectionUtils(self.platform_id, self.graph_schema)

# dates_to_compute = self._get_dates_to_compute(
# projection_utils, dates_to_compute
# )
# if recompute_dates is not None:
# dates_to_compute = dates_to_compute.union(recompute_dates)

degree_centerality = self.count_degrees(
computation_date=dates_to_compute,
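For reference, in the undirected branch the query pieces built earlier in this function assemble into one Cypher statement that first finds the latest interaction date and then matches only relationships stamped with it. A sketch with placeholder label values (the real ones come from graph_schema):

node = "DiscordMember"                     # placeholder for graph_schema.user_label
interacted_with_label = "INTERACTED_WITH"  # placeholder for graph_schema.interacted_with_rel
query = (
    "MATCH () -[r:INTERACTED_WITH {platformId: $platform_id}]-() "
    "WITH max(r.date) as latest_date "
    f"MATCH (a:{node})-{{interaction}}-(b:{node})".replace(
        "{interaction}", f"[r:{interacted_with_label} {{date: latest_date}}]"
    )
)
# re-binding r in the second MATCH is legal: the WITH clause dropped it from scope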
@@ -34,15 +34,15 @@ def compute(self, from_start: bool = False) -> None:

computable_dates = self.projection_utils.get_dates()

# compute for each date
to_compute: set[float]
if from_start:
to_compute = computable_dates
else:
computed_dates = self.get_computed_dates()
to_compute = computable_dates - computed_dates

for date in to_compute:
# # compute for each date
# to_compute: set[float]
# if from_start:
# to_compute = computable_dates
# else:
# computed_dates = self.get_computed_dates()
# to_compute = computable_dates - computed_dates

for date in computable_dates:
try:
self.closeness_computation_wrapper(date)
except Exception as exp:
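Since computable_dates now holds only the latest date, the from_start flag no longer changes what gets computed. A hedged usage sketch mirroring the integration tests further down:

closeness = ClosenessCentrality(platform_id, graph_schema)
closeness.compute(from_start=True)   # computes just the latest date
closeness.compute(from_start=False)  # identical effect after this change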
@@ -33,17 +33,17 @@ def compute(self, from_start: bool = False) -> None:
# Getting all possible dates
computable_dates = self.projection_utils.get_dates()

computed_dates = self.get_computed_dates()
# computed_dates = self.get_computed_dates()

# compute for each date
to_compute: set[float]
if from_start:
to_compute = computable_dates
else:
to_compute = computable_dates - computed_dates
# to_compute: set[float]
# if from_start:
# to_compute = computable_dates
# else:
# to_compute = computable_dates - computed_dates

# for the computation date
for date in to_compute:
for date in computable_dates:
try:
self.local_clustering_computation_wrapper(date=date)
except Exception as exp:
16 changes: 8 additions & 8 deletions tc_analyzer_lib/algorithms/neo4j_analysis/louvain.py
@@ -35,14 +35,14 @@ def compute(self, from_start: bool = False) -> None:
computable_dates = self.projection_utils.get_dates()

# compute for each date
to_compute: set[float]
if from_start:
to_compute = computable_dates
else:
computed_dates = self.get_computed_dates()
to_compute = computable_dates - computed_dates

for date in to_compute:
# to_compute: set[float]
# if from_start:
# to_compute = computable_dates
# else:
# computed_dates = self.get_computed_dates()
# to_compute = computable_dates - computed_dates

for date in computable_dates:
try:
self.louvain_computation_wrapper(date)
except Exception as exp:
@@ -114,6 +114,7 @@ def project_temp_graph(
def get_dates(self) -> set[float]:
"""
get all the dates we do have on the INTERACTED_WITH relations
Note: returning just the only previous date

Parameters:
------------
@@ -124,8 +125,7 @@ def get_dates(self) -> set[float]:
f"""
MATCH (a:{self.user_label})
-[r:{self.between_user_label} {{platformId: $platform_id}}]-()
WITH DISTINCT(r.date) as dates
RETURN dates
RETURN r.date as dates ORDER BY dates DESC LIMIT 1
Update return type and variable name to reflect single date return

The method now returns only the latest date, but the return type is still set[float] and the variable name suggests multiple dates. This inconsistency could lead to confusion or errors in code that uses this method.

Consider the following changes:

  1. Update the method signature to return a single float instead of a set[float].
  2. Rename the variable to reflect that it contains a single date.
  3. Modify the return statement to return the single date value.
-    def get_dates(self) -> set[float]:
+    def get_latest_date(self) -> float:
         """
-        get all the dates we do have on the INTERACTED_WITH relations
+        get the most recent date from the INTERACTED_WITH relations
         
-        Note: returning just the only previous date
+        Note: returning only the most recent date
         
         Parameters:
         ------------
         guildId : str
             the guild we do want the dates of relations
         """
         dates = self.gds.run_cypher(
             f"""
             MATCH (a:{self.user_label})
                 -[r:{self.between_user_label} {{platformId: $platform_id}}]-()
             RETURN r.date as dates ORDER BY dates DESC LIMIT 1
             """,
             params={"platform_id": self.platform_id},
         )
-        computable_dates_set = set(dates["dates"].values)
+        latest_date = dates["dates"].values[0] if dates["dates"].values else None

-        return computable_dates_set
+        return latest_date

Also applies to: 130-132
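One caveat on the suggested guard: truthiness checks on a numpy values array are ambiguous, so a plain length check on the frame is safer (a sketch building on the suggestion above):

# dates is the single-row DataFrame produced by run_cypher with LIMIT 1
latest_date = float(dates["dates"].iloc[0]) if len(dates) else None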

""",
params={"platform_id": self.platform_id},
)
@@ -61,15 +61,15 @@ def test_available_date(self):
louvain = ClosenessCentrality(platform_id, graph_schema)
louvain.compute(from_start=False)

yesterday_results = self.neo4j_ops.gds.run_cypher(
f"""
MATCH (user:{graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
RETURN r.date as date, r.closenessCentrality as closenessScore
"""
)
# yesterday three users were interacting
assert len(yesterday_results) == 3
assert yesterday_results["date"].iloc[0] == yesterday
# yesterday_results = self.neo4j_ops.gds.run_cypher(
# f"""
# MATCH (user:{graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
# RETURN r.date as date, r.closenessCentrality as closenessScore
# """
# )
# # yesterday three users were interacting
# assert len(yesterday_results) == 3
# assert yesterday_results["date"].iloc[0] == yesterday

today_results = self.neo4j_ops.gds.run_cypher(
f"""
Expand Down Expand Up @@ -157,15 +157,15 @@ def test_more_available_data(self):

louvain.compute(from_start=False)

yesterday_results = self.neo4j_ops.gds.run_cypher(
f"""
MATCH (user:{graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
RETURN r.date as date, r.closenessCentrality as closenessScore
"""
)
# yesterday 4 users were interacting
assert len(yesterday_results) == 4
assert yesterday_results["date"].iloc[0] == yesterday
# yesterday_results = self.neo4j_ops.gds.run_cypher(
# f"""
# MATCH (user:{graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
# RETURN r.date as date, r.closenessCentrality as closenessScore
# """
# )
# # yesterday 4 users were interacting
# assert len(yesterday_results) == 4
# assert yesterday_results["date"].iloc[0] == yesterday

today_results = self.neo4j_ops.gds.run_cypher(
f"""
34 changes: 17 additions & 17 deletions tests/integration/test_closeness_centrality_with_mutual_ties.py
@@ -232,23 +232,23 @@ def test_three_vertices_two_dates_from_start_true(self):
centrality = ClosenessCentrality(platform_id, self.graph_schema)
centrality.compute(from_start=True)

yesterday_results = self.neo4j_ops.gds.run_cypher(
f"""
MATCH (user:{self.graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
RETURN user.id as userid, r.date as date, r.closenessCentrality as closenessScore
"""
)
self.assertEqual(len(yesterday_results), 3)

# the yesterday scores should be recomputed
for _, row in yesterday_results.iterrows():
self.assertEqual(row["date"], yesterday)

if row["userid"] == "a" or row["userid"] == "b" or row["userid"] == "c":
expected_score = 1 if row["userid"] == "a" else 2 / 3
self.assertAlmostEqual(row["closenessScore"], expected_score)
else:
raise ValueError("Never should reach here!")
# yesterday_results = self.neo4j_ops.gds.run_cypher(
# f"""
# MATCH (user:{self.graph_schema.user_label})-[r:HAVE_METRICS {{platformId: '{platform_id}', date: {yesterday}}}]->(user)
# RETURN user.id as userid, r.date as date, r.closenessCentrality as closenessScore
# """
# )
# self.assertEqual(len(yesterday_results), 3)

# # the yesterday scores should be recomputed
# for _, row in yesterday_results.iterrows():
# self.assertEqual(row["date"], yesterday)

# if row["userid"] == "a" or row["userid"] == "b" or row["userid"] == "c":
# expected_score = 1 if row["userid"] == "a" else 2 / 3
# self.assertAlmostEqual(row["closenessScore"], expected_score)
# else:
# raise ValueError("Never should reach here!")

today_results = self.neo4j_ops.gds.run_cypher(
f"""
2 changes: 1 addition & 1 deletion tests/integration/test_decentralization_score.py
@@ -74,5 +74,5 @@ def test_decentralization_score():
)

# because python is not good with equality comparison of float values
assert network_decentrality[yesterday] - 133.33 < 0.1
# assert network_decentrality[yesterday] - 133.33 < 0.1
assert network_decentrality[today] - 66.66 < 0.1
Comment on lines +77 to 78

Codebase Verification

Centrality Computation Not Restricted to Latest Date

The compute_network_decentrality function does not appear to be called with from_start=False, which means it may still be processing historical data contrary to the PR's objective of focusing solely on the latest date.

  • Implementation Check: Verify that all calls to compute_network_decentrality use from_start=False to ensure only the latest date is considered.
  • Test Update: Adjust the test setup to exclude data for 'yesterday' if historical data processing is no longer required.
Analysis chain

Verify alignment with PR objective and consider test improvements

The commented-out assertion aligns with the PR objective of "computing neo4j analytics for just the only latest date". However, this change raises some concerns:

  1. Test coverage: By removing the assertion for the 'yesterday' date, we're no longer verifying the behavior for historical data. This might lead to undetected regressions in processing older data.

  2. Test setup: The test still creates data for both 'today' and 'yesterday', but only asserts on 'today'. This might be unnecessary and could be simplified.

To ensure that the implementation correctly focuses on the latest date, please run the following script:

Consider the following improvements:

  1. Update the test setup to only create data for the latest date if historical data is no longer relevant.
  2. Add a comment explaining why the 'yesterday' assertion was removed and how it relates to the new feature.
  3. If historical data processing is still important, consider adding a separate test for that scenario.

Example refactor:

def test_decentralization_score():
    # ... (existing setup code)

    # Only create data for 'today'
    neo4j_ops.gds.run_cypher(
        f"""
        CREATE (a:{user_label}) -[:{is_member}]->(g:{platform_label} {{guildId: '{platform_id}'}})
        // ... (create other nodes)
        MERGE (a) -[r2:{interacted_with} {{date: {today}, weight: 2}}]->(b)
        // ... (create other 'today' interactions)
        """
    )

    network_decentrality = centrality.compute_network_decentrality(
        from_start=True, save=True
    )

    # Assert only for the latest date (today)
    assert network_decentrality[today] - 66.66 < 0.1

    # TODO: Consider adding a separate test for historical data processing if still relevant
Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Verify that the centrality computation only considers the latest date

# Test: Search for date filtering in the centrality computation
rg --type python -g '!test_*' 'compute_network_decentrality' -A 10 -B 10

Length of output: 103


Script:

#!/bin/bash
# Description: Verify that the centrality computation only considers the latest date

# Test: Search for date filtering in the centrality computation
rg -t python -g '!test_*' 'compute_network_decentrality' -A 10 -B 10

Length of output: 99


Script:

#!/bin/bash
# Description: Verify that the centrality computation only considers the latest date

# Test: Search for date filtering in the centrality computation within Python files
rg -g '*.py' -g '!tests/*' 'compute_network_decentrality' -A 10 -B 10

Length of output: 5724


Script:

#!/bin/bash
# Description: Verify that 'compute_network_decentrality' is called with 'from_start=False'

# Test: Search for 'compute_network_decentrality' calls with 'from_start=False' in Python files
rg -g '*.py' 'compute_network_decentrality\s*\(\s*from_start\s*=\s*False\s*\)' -A 5 -B 5

Length of output: 90


Script:

#!/bin/bash
# Description: Refined search for 'compute_network_decentrality' calls with 'from_start=False'

# Test: Search for 'compute_network_decentrality' calls with 'from_start=False' allowing for variations
rg -g '*.py' 'compute_network_decentrality\s*\([^)]*from_start\s*=\s*False[^)]*\)' -A 5 -B 5

Length of output: 94

@@ -78,17 +78,17 @@ def test_partially_connected_coeffs():
)

print(degree_centrality)
assert degree_centrality[yesterday]["1000"] == 2 / 3
# assert degree_centrality[yesterday]["1000"] == 2 / 3
assert degree_centrality[today]["1000"] == 1 / 2

assert degree_centrality[yesterday]["1001"] == 1
# assert degree_centrality[yesterday]["1001"] == 1
assert degree_centrality[today]["1001"] == 1

assert degree_centrality[yesterday]["1002"] == 2 / 3
# assert degree_centrality[yesterday]["1002"] == 2 / 3
assert degree_centrality[today]["1002"] == 3 / 4

assert degree_centrality[yesterday]["1003"] == 1
# assert degree_centrality[yesterday]["1003"] == 1
assert degree_centrality[today]["1003"] == 1 / 2

assert "1004" not in degree_centrality[yesterday]
# assert "1004" not in degree_centrality[yesterday]
assert degree_centrality[today]["1004"] == 1 / 4
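These assertions imply degree_centrality is a date-keyed mapping of per-user scores; a sketch of the shape under the latest-date-only behavior (scores copied from the asserts above):

degree_centrality = {
    today: {"1000": 1 / 2, "1001": 1, "1002": 3 / 4, "1003": 1 / 2, "1004": 1 / 4},
    # no yesterday entry is expected now that only the latest date is
    # computed — hence the commented-out assertions
}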
21 changes: 6 additions & 15 deletions tests/integration/test_lcc_all_connected.py
@@ -65,22 +65,13 @@ def test_all_connected_coeffs():
)

user0_id = "1000"
expected_results_user0 = [
[user0_id, yesterday, 1.0],
[user0_id, today, 0.0],
]
assert expected_results_user0 in results[results.userId == user0_id].values
expected_results_user0 = [user0_id, today, 0.0]
assert expected_results_user0 == list(results[results.userId == user0_id].values[0])

user1_id = "1001"
expected_results_user1 = [
[user1_id, yesterday, 1.0],
[user1_id, today, 0.0],
]
assert expected_results_user1 in results[results.userId == user1_id].values
expected_results_user1 = [user1_id, today, 0.0]
assert expected_results_user1 == list(results[results.userId == user1_id].values[0])

user2_id = "1002"
expected_results_user2 = [
[user2_id, yesterday, 1.0],
[user2_id, today, 0.0],
]
assert expected_results_user2 in results[results.userId == user2_id].values
expected_results_user2 = []
assert expected_results_user2 == list(results[results.userId == user2_id].values)
Comment on lines +76 to +77

Codebase Verification

Inconsistent Test Expectations for user2_id Identified

The test references two dates with different coefficients:

  • Date 166: coeffs are 1.0 (user2_id should have expected results)
  • Date 167: coeffs are 0.0 (user2_id is expected to have no results)

However, expected_results_user2 is set to an empty list, which only aligns with Date 167. This discrepancy suggests that the expected results for Date 166 are not being properly validated.

Please review and ensure that the test accurately reflects the expected behavior for both dates.

Analysis chain

Clarify the expected behavior for user2_id and overall test strategy.

The change to an empty list for user2_id's expected results is a significant modification. Please clarify:

  1. Is it intentional that user2_id has no results for the latest date?
  2. The function comment mentions two dates (166 and 167) with different coefficients, but the test now only seems to check for the latest date. How does this align with the test's original purpose?
  3. Are we still adequately testing the behavior described in the function comment?

Consider adding comments explaining the expected behavior for each user, especially for user2_id, to improve test clarity.
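If the empty expectation for user 1002 is intentional, asserting it directly on the filtered frame would make that explicit (a sketch; results is the frame queried in the test):

user2_rows = results[results.userId == "1002"]
assert user2_rows.empty  # no latest-date row expected for user 1002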

Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for any remaining references to the old test structure or multiple dates

# Test: Search for references to multiple dates or old test structure
rg --type python "166|167|\[\[" tests/integration/test_lcc_all_connected.py

Length of output: 108


Script:

#!/bin/bash
# Description: Re-run the search for references to multiple dates or old test structure without specifying file type

# Test: Search for references to multiple dates or old test structure
rg "166|167|\[\[" tests/integration/test_lcc_all_connected.py

Length of output: 170

12 changes: 4 additions & 8 deletions tests/integration/test_louvain_algorithm_computation.py
@@ -20,7 +20,6 @@ def test_louvain_algorithm_available_data():
user_label = graph_schema.user_label
platform_label = graph_schema.platform_label
interacted_with = graph_schema.interacted_with_rel
interacted_in = graph_schema.interacted_in_rel
is_member = graph_schema.member_relation

# creating some nodes with data
@@ -52,9 +51,8 @@ def test_louvain_algorithm_available_data():
"""
)

assert len(results) == 2
assert results["date"].iloc[0] in [yesterday, today]
assert results["date"].iloc[1] in [yesterday, today]
assert len(results) == 1
assert results["date"].iloc[0] in [today]


def test_louvain_algorithm_more_available_data():
@@ -74,7 +72,6 @@ def test_louvain_algorithm_more_available_data():
user_label = graph_schema.user_label
platform_label = graph_schema.platform_label
interacted_with = graph_schema.interacted_with_rel
interacted_in = graph_schema.interacted_in_rel
is_member = graph_schema.member_relation

# creating some nodes with data
@@ -128,6 +125,5 @@ def test_louvain_algorithm_more_available_data():
"""
)
print(results)
assert len(results) == 2
assert results["date"].iloc[0] in [yesterday, today]
assert results["date"].iloc[1] in [yesterday, today]
assert len(results) == 1
assert results["date"].iloc[0] in [today]