|
87 | 87 |
|
88 | 88 | logger = logging.getLogger(__name__)
|
89 | 89 |
|
| 90 | +# Logging for https://github.com/matrix-org/matrix-spec/issues/1209 and |
| 91 | +# https://github.com/element-hq/synapse/issues/16940 |
| 92 | +client_state_desync_logger = logging.getLogger("synapse.client_state_desync_debug") |
| 93 | + |
90 | 94 | # Counts the number of times we returned a non-empty sync. `type` is one of
|
91 | 95 | # "initial_sync", "full_state_sync" or "incremental_sync", `lazy_loaded` is
|
92 | 96 | # "true" or "false" depending on if the request asked for lazy loaded members or
|
@@ -262,6 +266,17 @@ def __bool__(self) -> bool:
|
262 | 266 | )
|
263 | 267 |
|
264 | 268 |
|
| 269 | +@attr.s(slots=True, auto_attribs=True) |
| 270 | +class ClientCalculatedMembershipStateErrorEntry: |
| 271 | + """Difference between the membership states calculated by the client and the server |
| 272 | +
|
| 273 | + Returned by `SyncHandler._calculate_state_error`. |
| 274 | + """ |
| 275 | + |
| 276 | + actual: Optional[str] |
| 277 | + calculated: Optional[str] |
| 278 | + |
| 279 | + |
265 | 280 | class SyncHandler:
|
266 | 281 | def __init__(self, hs: "HomeServer"):
|
267 | 282 | self.hs_config = hs.config
|
@@ -1204,6 +1219,12 @@ async def _compute_state_delta_for_full_sync(
|
1204 | 1219 | previous_timeline_end={},
|
1205 | 1220 | lazy_load_members=lazy_load_members,
|
1206 | 1221 | )
|
| 1222 | + |
| 1223 | + if client_state_desync_logger.isEnabledFor(logging.WARNING): |
| 1224 | + await self._log_client_state_desync( |
| 1225 | + room_id, None, state_ids, timeline_state, lazy_load_members |
| 1226 | + ) |
| 1227 | + |
1207 | 1228 | return state_ids
|
1208 | 1229 |
|
1209 | 1230 | async def _compute_state_delta_for_incremental_sync(
|
@@ -1304,6 +1325,15 @@ async def _compute_state_delta_for_incremental_sync(
|
1304 | 1325 | lazy_load_members=lazy_load_members,
|
1305 | 1326 | )
|
1306 | 1327 |
|
| 1328 | + if client_state_desync_logger.isEnabledFor(logging.WARNING): |
| 1329 | + await self._log_client_state_desync( |
| 1330 | + room_id, |
| 1331 | + since_token, |
| 1332 | + state_ids, |
| 1333 | + timeline_state, |
| 1334 | + lazy_load_members, |
| 1335 | + ) |
| 1336 | + |
1307 | 1337 | return state_ids
|
1308 | 1338 |
|
1309 | 1339 | async def _find_missing_partial_state_memberships(
|
@@ -1420,6 +1450,136 @@ async def _find_missing_partial_state_memberships(
|
1420 | 1450 |
|
1421 | 1451 | return additional_state_ids
|
1422 | 1452 |
|
| 1453 | + async def _log_client_state_desync( |
| 1454 | + self, |
| 1455 | + room_id: str, |
| 1456 | + since_token: Optional[StreamToken], |
| 1457 | + sync_response_state_state: StateMap[str], |
| 1458 | + sync_response_timeline_state: StateMap[str], |
| 1459 | + lazy_load_members: bool, |
| 1460 | + ) -> None: |
| 1461 | + # Tracking to see how often the client's state gets out of sync with the |
| 1462 | + # actual current state of the room. |
| 1463 | + # |
| 1464 | + # There are few different potential failure modes here: |
| 1465 | + # |
| 1466 | + # * State resolution can cause changes in the state of the room that don't |
| 1467 | + # directly correspond to events with the corresponding (type, state_key). |
| 1468 | + # https://github.com/matrix-org/matrix-spec/issues/1209 discusses this in |
| 1469 | + # more detail. |
| 1470 | + # |
| 1471 | + # * Even where there is an event that causes a given state change, Synapse |
| 1472 | + # may not serve it to the client, since it works on state at specific points |
| 1473 | + # in the DAG, rather than "current state". |
| 1474 | + # See https://github.com/element-hq/synapse/issues/16940. |
| 1475 | + # |
| 1476 | + # * Lazy-loading adds more complexity, as it means that events that would |
| 1477 | + # normally be served via the `state` part of an incremental sync are filtered |
| 1478 | + # out. |
| 1479 | + # |
| 1480 | + # To try to get a handle on this, let's put ourselves in the shoes of a client, |
| 1481 | + # and compare the state they will calculate against the actual current state. |
| 1482 | + |
| 1483 | + if since_token is None: |
| 1484 | + if lazy_load_members: |
| 1485 | + # For initial syncs with lazy-loading enabled, there's not too much |
| 1486 | + # concern here. We know the client will do a `/members` query before |
| 1487 | + # doing any encryption, so what sync returns isn't too important. |
| 1488 | + # |
| 1489 | + # (Of course, then `/members` might also return an incomplete list, but |
| 1490 | + # that's a separate problem.) |
| 1491 | + return |
| 1492 | + |
| 1493 | + # For regular initial syncs, compare the returned response with the actual |
| 1494 | + # current state. |
| 1495 | + client_calculated_state = {} |
| 1496 | + client_calculated_state.update(sync_response_state_state) |
| 1497 | + client_calculated_state.update(sync_response_timeline_state) |
| 1498 | + |
| 1499 | + current_state = await self._state_storage_controller.get_current_state_ids( |
| 1500 | + room_id, await_full_state=False |
| 1501 | + ) |
| 1502 | + state_error = await self._calculate_state_error( |
| 1503 | + current_state, client_calculated_state |
| 1504 | + ) |
| 1505 | + if state_error: |
| 1506 | + client_state_desync_logger.warning( |
| 1507 | + "client state discrepancy in initialsync in room %s: %s", |
| 1508 | + room_id, |
| 1509 | + state_error, |
| 1510 | + ) |
| 1511 | + else: |
| 1512 | + # For an incremental (gappy or otherwise) sync, let's assume the client has |
| 1513 | + # a complete membership list as of the last sync (or rather, at |
| 1514 | + # `since_token`, which is the closes approximation we have to it |
| 1515 | + # right now), and see what they would calculate as the current state given |
| 1516 | + # this sync update. |
| 1517 | + |
| 1518 | + client_calculated_state = dict( |
| 1519 | + await self.get_state_at( |
| 1520 | + room_id, |
| 1521 | + stream_position=since_token, |
| 1522 | + await_full_state=False, |
| 1523 | + ) |
| 1524 | + ) |
| 1525 | + client_calculated_state.update(sync_response_state_state) |
| 1526 | + client_calculated_state.update(sync_response_timeline_state) |
| 1527 | + |
| 1528 | + current_state = await self._state_storage_controller.get_current_state_ids( |
| 1529 | + room_id, await_full_state=False |
| 1530 | + ) |
| 1531 | + |
| 1532 | + state_error = await self._calculate_state_error( |
| 1533 | + current_state, client_calculated_state |
| 1534 | + ) |
| 1535 | + if state_error: |
| 1536 | + client_state_desync_logger.warning( |
| 1537 | + "client state discrepancy in incremental sync in room %s: %s", |
| 1538 | + room_id, |
| 1539 | + state_error, |
| 1540 | + ) |
| 1541 | + |
| 1542 | + async def _calculate_state_error( |
| 1543 | + self, |
| 1544 | + actual_state: StateMap[str], |
| 1545 | + client_calculated_state: StateMap[str], |
| 1546 | + ) -> Mapping[str, ClientCalculatedMembershipStateErrorEntry]: |
| 1547 | + error_map: Dict[str, ClientCalculatedMembershipStateErrorEntry] = {} |
| 1548 | + |
| 1549 | + async def event_id_to_membership(event_id: Optional[str]) -> Optional[str]: |
| 1550 | + if event_id is None: |
| 1551 | + return None |
| 1552 | + event = await self.store.get_event(event_id) |
| 1553 | + return event.membership |
| 1554 | + |
| 1555 | + # Check for entries in the calculated state which differ from the actual state. |
| 1556 | + for ( |
| 1557 | + event_type, |
| 1558 | + state_key, |
| 1559 | + ), calculated_event_id in client_calculated_state.items(): |
| 1560 | + if event_type != EventTypes.Member: |
| 1561 | + continue |
| 1562 | + |
| 1563 | + actual_event_id = actual_state.get((event_type, state_key)) |
| 1564 | + if calculated_event_id != actual_event_id: |
| 1565 | + error_map[state_key] = ClientCalculatedMembershipStateErrorEntry( |
| 1566 | + actual=await event_id_to_membership(actual_event_id), |
| 1567 | + calculated=await event_id_to_membership(calculated_event_id), |
| 1568 | + ) |
| 1569 | + |
| 1570 | + # Check for entries which are missing altogether. |
| 1571 | + for (event_type, state_key), actual_event_id in actual_state.items(): |
| 1572 | + if event_type != EventTypes.Member: |
| 1573 | + continue |
| 1574 | + |
| 1575 | + if (event_type, state_key) not in client_calculated_state: |
| 1576 | + error_map[state_key] = ClientCalculatedMembershipStateErrorEntry( |
| 1577 | + actual=await event_id_to_membership(actual_event_id), |
| 1578 | + calculated=None, |
| 1579 | + ) |
| 1580 | + |
| 1581 | + return error_map |
| 1582 | + |
1423 | 1583 | async def unread_notifs_for_room_id(
|
1424 | 1584 | self, room_id: str, sync_config: SyncConfig
|
1425 | 1585 | ) -> RoomNotifCounts:
|
|
0 commit comments