Skip to content

Commit e41a755

Browse files
committed
Add some debug to keep track of client state desynchronisation
Fixes element-hq/crypto-internal#179
1 parent e200023 commit e41a755

File tree

1 file changed

+160
-0
lines changed

1 file changed

+160
-0
lines changed

synapse/handlers/sync.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@
8787

8888
logger = logging.getLogger(__name__)
8989

90+
# Logging for https://github.com/matrix-org/matrix-spec/issues/1209 and
91+
# https://github.com/element-hq/synapse/issues/16940
92+
client_state_desync_logger = logging.getLogger("synapse.client_state_desync_debug")
93+
9094
# Counts the number of times we returned a non-empty sync. `type` is one of
9195
# "initial_sync", "full_state_sync" or "incremental_sync", `lazy_loaded` is
9296
# "true" or "false" depending on if the request asked for lazy loaded members or
@@ -262,6 +266,17 @@ def __bool__(self) -> bool:
262266
)
263267

264268

269+
@attr.s(slots=True, auto_attribs=True)
270+
class ClientCalculatedMembershipStateErrorEntry:
271+
"""Difference between the membership states calculated by the client and the server
272+
273+
Returned by `SyncHandler._calculate_state_error`.
274+
"""
275+
276+
actual: Optional[str]
277+
calculated: Optional[str]
278+
279+
265280
class SyncHandler:
266281
def __init__(self, hs: "HomeServer"):
267282
self.hs_config = hs.config
@@ -1204,6 +1219,12 @@ async def _compute_state_delta_for_full_sync(
12041219
previous_timeline_end={},
12051220
lazy_load_members=lazy_load_members,
12061221
)
1222+
1223+
if client_state_desync_logger.isEnabledFor(logging.WARNING):
1224+
await self._log_client_state_desync(
1225+
room_id, None, state_ids, timeline_state, lazy_load_members
1226+
)
1227+
12071228
return state_ids
12081229

12091230
async def _compute_state_delta_for_incremental_sync(
@@ -1304,6 +1325,15 @@ async def _compute_state_delta_for_incremental_sync(
13041325
lazy_load_members=lazy_load_members,
13051326
)
13061327

1328+
if client_state_desync_logger.isEnabledFor(logging.WARNING):
1329+
await self._log_client_state_desync(
1330+
room_id,
1331+
since_token,
1332+
state_ids,
1333+
timeline_state,
1334+
lazy_load_members,
1335+
)
1336+
13071337
return state_ids
13081338

13091339
async def _find_missing_partial_state_memberships(
@@ -1420,6 +1450,136 @@ async def _find_missing_partial_state_memberships(
14201450

14211451
return additional_state_ids
14221452

1453+
async def _log_client_state_desync(
1454+
self,
1455+
room_id: str,
1456+
since_token: Optional[StreamToken],
1457+
sync_response_state_state: StateMap[str],
1458+
sync_response_timeline_state: StateMap[str],
1459+
lazy_load_members: bool,
1460+
) -> None:
1461+
# Tracking to see how often the client's state gets out of sync with the
1462+
# actual current state of the room.
1463+
#
1464+
# There are few different potential failure modes here:
1465+
#
1466+
# * State resolution can cause changes in the state of the room that don't
1467+
# directly correspond to events with the corresponding (type, state_key).
1468+
# https://github.com/matrix-org/matrix-spec/issues/1209 discusses this in
1469+
# more detail.
1470+
#
1471+
# * Even where there is an event that causes a given state change, Synapse
1472+
# may not serve it to the client, since it works on state at specific points
1473+
# in the DAG, rather than "current state".
1474+
# See https://github.com/element-hq/synapse/issues/16940.
1475+
#
1476+
# * Lazy-loading adds more complexity, as it means that events that would
1477+
# normally be served via the `state` part of an incremental sync are filtered
1478+
# out.
1479+
#
1480+
# To try to get a handle on this, let's put ourselves in the shoes of a client,
1481+
# and compare the state they will calculate against the actual current state.
1482+
1483+
if since_token is None:
1484+
if lazy_load_members:
1485+
# For initial syncs with lazy-loading enabled, there's not too much
1486+
# concern here. We know the client will do a `/members` query before
1487+
# doing any encryption, so what sync returns isn't too important.
1488+
#
1489+
# (Of course, then `/members` might also return an incomplete list, but
1490+
# that's a separate problem.)
1491+
return
1492+
1493+
# For regular initial syncs, compare the returned response with the actual
1494+
# current state.
1495+
client_calculated_state = {}
1496+
client_calculated_state.update(sync_response_state_state)
1497+
client_calculated_state.update(sync_response_timeline_state)
1498+
1499+
current_state = await self._state_storage_controller.get_current_state_ids(
1500+
room_id, await_full_state=False
1501+
)
1502+
state_error = await self._calculate_state_error(
1503+
current_state, client_calculated_state
1504+
)
1505+
if state_error:
1506+
client_state_desync_logger.warning(
1507+
"client state discrepancy in initialsync in room %s: %s",
1508+
room_id,
1509+
state_error,
1510+
)
1511+
else:
1512+
# For an incremental (gappy or otherwise) sync, let's assume the client has
1513+
# a complete membership list as of the last sync (or rather, at
1514+
# `since_token`, which is the closes approximation we have to it
1515+
# right now), and see what they would calculate as the current state given
1516+
# this sync update.
1517+
1518+
client_calculated_state = dict(
1519+
await self.get_state_at(
1520+
room_id,
1521+
stream_position=since_token,
1522+
await_full_state=False,
1523+
)
1524+
)
1525+
client_calculated_state.update(sync_response_state_state)
1526+
client_calculated_state.update(sync_response_timeline_state)
1527+
1528+
current_state = await self._state_storage_controller.get_current_state_ids(
1529+
room_id, await_full_state=False
1530+
)
1531+
1532+
state_error = await self._calculate_state_error(
1533+
current_state, client_calculated_state
1534+
)
1535+
if state_error:
1536+
client_state_desync_logger.warning(
1537+
"client state discrepancy in incremental sync in room %s: %s",
1538+
room_id,
1539+
state_error,
1540+
)
1541+
1542+
async def _calculate_state_error(
1543+
self,
1544+
actual_state: StateMap[str],
1545+
client_calculated_state: StateMap[str],
1546+
) -> Mapping[str, ClientCalculatedMembershipStateErrorEntry]:
1547+
error_map: Dict[str, ClientCalculatedMembershipStateErrorEntry] = {}
1548+
1549+
async def event_id_to_membership(event_id: Optional[str]) -> Optional[str]:
1550+
if event_id is None:
1551+
return None
1552+
event = await self.store.get_event(event_id)
1553+
return event.membership
1554+
1555+
# Check for entries in the calculated state which differ from the actual state.
1556+
for (
1557+
event_type,
1558+
state_key,
1559+
), calculated_event_id in client_calculated_state.items():
1560+
if event_type != EventTypes.Member:
1561+
continue
1562+
1563+
actual_event_id = actual_state.get((event_type, state_key))
1564+
if calculated_event_id != actual_event_id:
1565+
error_map[state_key] = ClientCalculatedMembershipStateErrorEntry(
1566+
actual=await event_id_to_membership(actual_event_id),
1567+
calculated=await event_id_to_membership(calculated_event_id),
1568+
)
1569+
1570+
# Check for entries which are missing altogether.
1571+
for (event_type, state_key), actual_event_id in actual_state.items():
1572+
if event_type != EventTypes.Member:
1573+
continue
1574+
1575+
if (event_type, state_key) not in client_calculated_state:
1576+
error_map[state_key] = ClientCalculatedMembershipStateErrorEntry(
1577+
actual=await event_id_to_membership(actual_event_id),
1578+
calculated=None,
1579+
)
1580+
1581+
return error_map
1582+
14231583
async def unread_notifs_for_room_id(
14241584
self, room_id: str, sync_config: SyncConfig
14251585
) -> RoomNotifCounts:

0 commit comments

Comments
 (0)