From 1920690551a296bb6271cd55bf81b68bc5967a61 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 13:06:08 +0800
Subject: [PATCH 1/7] decrease buffer size and print timestamps

---
 examples/video-stream/video_play.py     | 62 +++++++++++++++++--------
 livekit-rtc/livekit/rtc/synchronizer.py | 24 ++++++++--
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index 05f753ef..6365787c 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -13,9 +13,10 @@
 
 try:
     import av
+    import cv2
 except ImportError:
     raise RuntimeError(
-        "av is required to run this example, install with `pip install av`"
+        "av and opencv-python is required to run this example, install with `pip install av opencv-python`"
     )
 
 # ensure LIVEKIT_URL, LIVEKIT_API_KEY, and LIVEKIT_API_SECRET are set
@@ -51,36 +52,56 @@ def __init__(self, media_file: Union[str, Path]) -> None:
             audio_sample_rate=audio_stream.sample_rate,
             audio_channels=audio_stream.channels,
         )
+        print(self._info)
 
     @property
     def info(self) -> MediaInfo:
         return self._info
 
-    async def stream_video(self) -> AsyncIterable[rtc.VideoFrame]:
+    async def stream_video(
+        self, av_sync: rtc.AVSynchronizer
+    ) -> AsyncIterable[tuple[rtc.VideoFrame, float]]:
         """Streams video frames from the media file in an endless loop."""
-        for av_frame in self._video_container.decode(video=0):
+        for i, av_frame in enumerate(self._video_container.decode(video=0)):
             # Convert video frame to RGBA
             frame = av_frame.to_rgb().to_ndarray()
             frame_rgba = np.ones((frame.shape[0], frame.shape[1], 4), dtype=np.uint8)
             frame_rgba[:, :, :3] = frame
-            yield rtc.VideoFrame(
-                width=frame.shape[1],
-                height=frame.shape[0],
-                type=rtc.VideoBufferType.RGBA,
-                data=frame_rgba.tobytes(),
+
+            # put fps and timestamps in the frame
+            frame_rgba = cv2.putText(
+                frame_rgba, f"{av_sync.actual_fps:.2f}fps", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2
+            )
+
+            if i % 10 == 0:
+                print(
+                    f"decoded frame {i} ({av_frame.time:.3f}s), {av_sync.actual_fps:.2f}fps, "
+                    f"last video time: {av_sync.last_video_time:.3f}s, last audio time: {av_sync.last_audio_time:.3f}s"
+                )
+            yield (
+                rtc.VideoFrame(
+                    width=frame.shape[1],
+                    height=frame.shape[0],
+                    type=rtc.VideoBufferType.RGBA,
+                    data=frame_rgba.tobytes(),
+                ),
+                av_frame.time,
             )
 
-    async def stream_audio(self) -> AsyncIterable[rtc.AudioFrame]:
+    async def stream_audio(self) -> AsyncIterable[tuple[rtc.AudioFrame, float]]:
         """Streams audio frames from the media file in an endless loop."""
-        for av_frame in self._audio_container.decode(audio=0):
+        for i, av_frame in enumerate(self._audio_container.decode(audio=0)):
             # Convert audio frame to raw int16 samples
             frame = av_frame.to_ndarray().T  # Transpose to (samples, channels)
             frame = (frame * 32768).astype(np.int16)
-            yield rtc.AudioFrame(
-                data=frame.tobytes(),
-                sample_rate=self.info.audio_sample_rate,
-                num_channels=frame.shape[1],
-                samples_per_channel=frame.shape[0],
+            yield (
+                rtc.AudioFrame(
+                    data=frame.tobytes(),
+                    sample_rate=self.info.audio_sample_rate,
+                    num_channels=frame.shape[1],
+                    samples_per_channel=frame.shape[0],
+                ),
+                av_frame.time,
             )
 
     def reset(self):
@@ -102,6 +123,7 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
             api.VideoGrants(
                 room_join=True,
                 room=room_name,
+                agent=True,
             )
         )
         .to_jwt()
@@ -121,7 +143,7 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
     media_info = streamer.info
 
     # Create video and audio sources/tracks
-    queue_size_ms = 1000  # 1 second
+    queue_size_ms = 50  # TODO: testing with different sizes
     video_source = rtc.VideoSource(
         width=media_info.video_width,
         height=media_info.video_height,
@@ -157,18 +179,18 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
     )
 
     async def _push_frames(
-        stream: AsyncIterable[rtc.VideoFrame | rtc.AudioFrame],
+        stream: AsyncIterable[tuple[rtc.VideoFrame | rtc.AudioFrame, float]],
         av_sync: rtc.AVSynchronizer,
     ):
-        async for frame in stream:
-            await av_sync.push(frame)
+        async for frame, timestamp in stream:
+            await av_sync.push(frame, timestamp)
             await asyncio.sleep(0)
 
     try:
         while True:
             streamer.reset()
             video_task = asyncio.create_task(
-                _push_frames(streamer.stream_video(), av_sync)
+                _push_frames(streamer.stream_video(av_sync), av_sync)
             )
             audio_task = asyncio.create_task(
                 _push_frames(streamer.stream_audio(), av_sync)
diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
index 09d442cd..4efd6a13 100644
--- a/livekit-rtc/livekit/rtc/synchronizer.py
+++ b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -43,6 +43,8 @@ def __init__(
         self._max_delay_tolerance_ms = _max_delay_tolerance_ms
 
         self._stopped = False
+        self._last_video_time: float = 0
+        self._last_audio_time: float = 0
 
         self._video_queue_max_size = int(
             self._video_fps * self._video_queue_size_ms / 1000
@@ -51,7 +53,7 @@ def __init__(
             # ensure queue is bounded if queue size is specified
             self._video_queue_max_size = max(1, self._video_queue_max_size)
 
-        self._video_queue = asyncio.Queue[VideoFrame](
+        self._video_queue = asyncio.Queue[tuple[VideoFrame, float]](
             maxsize=self._video_queue_max_size
         )
         self._fps_controller = _FPSController(
@@ -60,12 +62,16 @@ def __init__(
         )
         self._capture_video_task = asyncio.create_task(self._capture_video())
 
-    async def push(self, frame: Union[VideoFrame, AudioFrame]) -> None:
+    async def push(
+        self, frame: Union[VideoFrame, AudioFrame], timestamp: Optional[float] = None
+    ) -> None:
         if isinstance(frame, AudioFrame):
             await self._audio_source.capture_frame(frame)
+            if timestamp is not None:
+                self._last_audio_time = timestamp
             return
 
-        await self._video_queue.put(frame)
+        await self._video_queue.put((frame, timestamp))
 
     async def clear_queue(self) -> None:
         self._audio_source.clear_queue()
@@ -79,9 +85,11 @@ async def wait_for_playout(self) -> None:
 
     async def _capture_video(self) -> None:
         while not self._stopped:
-            frame = await self._video_queue.get()
+            frame, timestamp = await self._video_queue.get()
             async with self._fps_controller:
                 self._video_source.capture_frame(frame)
+                if timestamp is not None:
+                    self._last_video_time = timestamp
             self._video_queue.task_done()
 
     async def aclose(self) -> None:
@@ -93,6 +101,14 @@ async def aclose(self) -> None:
     def actual_fps(self) -> float:
         return self._fps_controller.actual_fps
 
+    @property
+    def last_video_time(self) -> float:
+        return self._last_video_time
+
+    @property
+    def last_audio_time(self) -> float:
+        return self._last_audio_time
+
 
 class _FPSController:
     def __init__(

From 223be6953e71e07ad0d036d74a30eb070ae29ca3 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 13:22:17 +0800
Subject: [PATCH 2/7] put timestamp on frames

---
 examples/video-stream/video_play.py     | 21 +++-------------
 livekit-rtc/livekit/rtc/synchronizer.py | 33 +++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index 6365787c..b42d5f67 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -13,10 +13,9 @@
 
 try:
     import av
-    import cv2
 except ImportError:
     raise RuntimeError(
-        "av and opencv-python is required to run this example, install with `pip install av opencv-python`"
+        "av is required to run this example, install with `pip install av`"
     )
 
 # ensure LIVEKIT_URL, LIVEKIT_API_KEY, and LIVEKIT_API_SECRET are set
@@ -52,32 +51,18 @@ def __init__(self, media_file: Union[str, Path]) -> None:
             audio_sample_rate=audio_stream.sample_rate,
             audio_channels=audio_stream.channels,
         )
-        print(self._info)
 
     @property
     def info(self) -> MediaInfo:
         return self._info
 
-    async def stream_video(
-        self, av_sync: rtc.AVSynchronizer
-    ) -> AsyncIterable[tuple[rtc.VideoFrame, float]]:
+    async def stream_video(self) -> AsyncIterable[tuple[rtc.VideoFrame, float]]:
         """Streams video frames from the media file in an endless loop."""
         for i, av_frame in enumerate(self._video_container.decode(video=0)):
             # Convert video frame to RGBA
             frame = av_frame.to_rgb().to_ndarray()
             frame_rgba = np.ones((frame.shape[0], frame.shape[1], 4), dtype=np.uint8)
             frame_rgba[:, :, :3] = frame
-
-            # put fps and timestamps in the frame
-            frame_rgba = cv2.putText(
-                frame_rgba, f"{av_sync.actual_fps:.2f}fps", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2
-            )
-
-            if i % 10 == 0:
-                print(
-                    f"decoded frame {i} ({av_frame.time:.3f}s), {av_sync.actual_fps:.2f}fps, "
-                    f"last video time: {av_sync.last_video_time:.3f}s, last audio time: {av_sync.last_audio_time:.3f}s"
-                )
             yield (
                 rtc.VideoFrame(
                     width=frame.shape[1],
@@ -190,7 +175,7 @@ async def _push_frames(
         while True:
             streamer.reset()
             video_task = asyncio.create_task(
-                _push_frames(streamer.stream_video(av_sync), av_sync)
+                _push_frames(streamer.stream_video(), av_sync)
             )
             audio_task = asyncio.create_task(
                 _push_frames(streamer.stream_audio(), av_sync)
diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
index 4efd6a13..89a9180a 100644
--- a/livekit-rtc/livekit/rtc/synchronizer.py
+++ b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -8,6 +8,8 @@
 from .audio_frame import AudioFrame
 from .audio_source import AudioSource
 from .video_source import VideoSource
+import numpy as np
+import cv2
 
 logger = logging.getLogger(__name__)
 
@@ -84,8 +86,39 @@ async def wait_for_playout(self) -> None:
         await self._video_queue.join()
 
     async def _capture_video(self) -> None:
+        count = 0
         while not self._stopped:
             frame, timestamp = await self._video_queue.get()
+
+            # debug
+            frame_rgba = np.frombuffer(frame.data, dtype=np.uint8).reshape(
+                frame.height, frame.width, 4
+            )
+            frame_bgr = cv2.cvtColor(frame_rgba[:, :, :3], cv2.COLOR_RGBA2BGR)
+            frame_bgr = cv2.putText(
+                frame_bgr,
+                f"{self.actual_fps:.2f}fps, video time: {timestamp:.3f}s, audio time: {self.last_audio_time:.3f}s",
+                (10, 100),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                1,
+                (0, 0, 255),
+                2,
+            )
+            frame_rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA)
+            frame = VideoFrame(
+                width=frame.width,
+                height=frame.height,
+                type=frame.type,
+                data=frame_rgba.tobytes(),
+            )
+            count += 1
+            if count % 30 == 0:
+                print(
+                    f"{self.actual_fps:.2f}fps, last video time: {self.last_video_time:.3f}s, "
+                    f"last audio time: {self.last_audio_time:.3f}s"
+                )
+            # end debug
+
             async with self._fps_controller:
                 self._video_source.capture_frame(frame)
                 if timestamp is not None:

From 964dcb2d993a5a7415fcbfe038180bcd3c978d43 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 14:48:44 +0800
Subject: [PATCH 3/7] update audio time calculation

---
 examples/video-stream/video_play.py     |  5 +-
 livekit-rtc/livekit/rtc/synchronizer.py | 61 +++++++++++++------------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index b42d5f67..5d5b6064 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -79,6 +79,7 @@ async def stream_audio(self) -> AsyncIterable[tuple[rtc.AudioFrame, float]]:
             # Convert audio frame to raw int16 samples
             frame = av_frame.to_ndarray().T  # Transpose to (samples, channels)
             frame = (frame * 32768).astype(np.int16)
+            duration = len(frame) / self.info.audio_sample_rate
             yield (
                 rtc.AudioFrame(
                     data=frame.tobytes(),
@@ -86,7 +87,7 @@ async def stream_audio(self) -> AsyncIterable[tuple[rtc.AudioFrame, float]]:
                     num_channels=frame.shape[1],
                     samples_per_channel=frame.shape[0],
                 ),
-                av_frame.time,
+                av_frame.time + duration,
             )
 
     def reset(self):
@@ -128,7 +129,7 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
     media_info = streamer.info
 
     # Create video and audio sources/tracks
-    queue_size_ms = 50  # TODO: testing with different sizes
+    queue_size_ms = 1000  # TODO: testing with different sizes
     video_source = rtc.VideoSource(
         width=media_info.video_width,
         height=media_info.video_height,
diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
index 89a9180a..23f723eb 100644
--- a/livekit-rtc/livekit/rtc/synchronizer.py
+++ b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -90,39 +90,42 @@ async def _capture_video(self) -> None:
         while not self._stopped:
             frame, timestamp = await self._video_queue.get()
 
-            # debug
-            frame_rgba = np.frombuffer(frame.data, dtype=np.uint8).reshape(
-                frame.height, frame.width, 4
-            )
-            frame_bgr = cv2.cvtColor(frame_rgba[:, :, :3], cv2.COLOR_RGBA2BGR)
-            frame_bgr = cv2.putText(
-                frame_bgr,
-                f"{self.actual_fps:.2f}fps, video time: {timestamp:.3f}s, audio time: {self.last_audio_time:.3f}s",
-                (10, 100),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                1,
-                (0, 0, 255),
-                2,
-            )
-            frame_rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA)
-            frame = VideoFrame(
-                width=frame.width,
-                height=frame.height,
-                type=frame.type,
-                data=frame_rgba.tobytes(),
-            )
-            count += 1
-            if count % 30 == 0:
-                print(
-                    f"{self.actual_fps:.2f}fps, last video time: {self.last_video_time:.3f}s, "
-                    f"last audio time: {self.last_audio_time:.3f}s"
+            async with self._fps_controller:
+                # debug
+                frame_rgba = np.frombuffer(frame.data, dtype=np.uint8).reshape(
+                    frame.height, frame.width, 4
+                )
+                frame_bgr = cv2.cvtColor(frame_rgba[:, :, :3], cv2.COLOR_RGBA2BGR)
+                frame_bgr = cv2.putText(
+                    frame_bgr,
+                    f"{self.actual_fps:.2f}fps, video time: {timestamp:.3f}s, "
+                    f"audio time: {self.last_audio_time:.3f}s, diff: {timestamp - self.last_audio_time:.3f}s",
+                    (10, 100),
+                    cv2.FONT_HERSHEY_SIMPLEX,
+                    1,
+                    (0, 0, 255),
+                    2,
+                )
+                frame_rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA)
+                frame = VideoFrame(
+                    width=frame.width,
+                    height=frame.height,
+                    type=frame.type,
+                    data=frame_rgba.tobytes(),
                 )
-            # end debug
+                count += 1
+                # end debug
 
-            async with self._fps_controller:
                 self._video_source.capture_frame(frame)
                 if timestamp is not None:
                     self._last_video_time = timestamp
+
+                if count % 30 == 0:
+                    diff = self.last_video_time - self.last_audio_time
+                    print(
+                        f"{self.actual_fps:.2f}fps, last video time: {self.last_video_time:.3f}s, "
+                        f"last audio time: {self.last_audio_time:.3f}s, diff: {diff:.3f}s"
+                    )
             self._video_queue.task_done()
 
     async def aclose(self) -> None:
@@ -140,7 +143,7 @@ def last_video_time(self) -> float:
 
     @property
     def last_audio_time(self) -> float:
-        return self._last_audio_time
+        return self._last_audio_time - self._audio_source.queued_duration
 
 
 class _FPSController:

From 51b51e0011145fbcd2fe02300755860e2d0b8763 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 19:08:00 +0800
Subject: [PATCH 4/7] add reset for av sync

---
 examples/video-stream/video_play.py     | 40 ++++++++++++++---
 livekit-rtc/livekit/rtc/synchronizer.py | 60 ++++++++++---------------
 2 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index 5d5b6064..0013cce4 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -75,7 +75,7 @@ async def stream_video(self) -> AsyncIterable[tuple[rtc.VideoFrame, float]]:
 
     async def stream_audio(self) -> AsyncIterable[tuple[rtc.AudioFrame, float]]:
         """Streams audio frames from the media file in an endless loop."""
-        for i, av_frame in enumerate(self._audio_container.decode(audio=0)):
+        for av_frame in self._audio_container.decode(audio=0):
             # Convert audio frame to raw int16 samples
             frame = av_frame.to_ndarray().T  # Transpose to (samples, channels)
             frame = (frame * 32768).astype(np.int16)
@@ -129,7 +129,7 @@ async def main(room: rtc.Room, room_name: str, media_path: str):
     media_info = streamer.info
 
     # Create video and audio sources/tracks
-    queue_size_ms = 1000  # TODO: testing with different sizes
+    queue_size_ms = 1000
     video_source = rtc.VideoSource(
         width=media_info.video_width,
         height=media_info.video_height,
@@ -172,19 +172,45 @@ async def _push_frames(
             await av_sync.push(frame, timestamp)
             await asyncio.sleep(0)
 
+    async def _log_fps(av_sync: rtc.AVSynchronizer):
+        while True:
+            await asyncio.sleep(2)
+            diff = av_sync.last_video_time - av_sync.last_audio_time
+
+            logger.info(
+                f"fps: {av_sync.actual_fps:.2f}, video_time: {av_sync.last_video_time:.3f}s, "
+                f"audio_time: {av_sync.last_audio_time:.3f}s, diff: {diff:.3f}s"
+            )
+
     try:
         while True:
             streamer.reset()
-            video_task = asyncio.create_task(
-                _push_frames(streamer.stream_video(), av_sync)
-            )
-            audio_task = asyncio.create_task(
-                _push_frames(streamer.stream_audio(), av_sync)
+
+            video_stream = streamer.stream_video()
+            audio_stream = streamer.stream_audio()
+
+            # read the first frame of each stream and push them back-to-back
+            first_video_frame, video_timestamp = await video_stream.__anext__()
+            first_audio_frame, audio_timestamp = await audio_stream.__anext__()
+            logger.info(
+                f"first video duration: {1/media_info.video_fps:.3f}s, "
+                f"first audio duration: {first_audio_frame.duration:.3f}s"
             )
+            await av_sync.push(first_video_frame, video_timestamp)
+            await av_sync.push(first_audio_frame, audio_timestamp)
+
+            video_task = asyncio.create_task(_push_frames(video_stream, av_sync))
+            audio_task = asyncio.create_task(_push_frames(audio_stream, av_sync))
+
+            log_fps_task = asyncio.create_task(_log_fps(av_sync))
 
             # wait for both tasks to complete
             await asyncio.gather(video_task, audio_task)
             await av_sync.wait_for_playout()
+
+            # clean up
+            av_sync.reset()
+            log_fps_task.cancel()
             logger.info("playout finished")
     finally:
         await streamer.aclose()
diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
index 23f723eb..a6ea41bd 100644
--- a/livekit-rtc/livekit/rtc/synchronizer.py
+++ b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -45,6 +45,7 @@ def __init__(
         self._max_delay_tolerance_ms = _max_delay_tolerance_ms
 
         self._stopped = False
+        # the time of the last video/audio frame captured
         self._last_video_time: float = 0
         self._last_audio_time: float = 0
 
@@ -55,7 +56,7 @@ def __init__(
             # ensure queue is bounded if queue size is specified
             self._video_queue_max_size = max(1, self._video_queue_max_size)
 
-        self._video_queue = asyncio.Queue[tuple[VideoFrame, float]](
+        self._video_queue = asyncio.Queue[tuple[VideoFrame, Optional[float]]](
             maxsize=self._video_queue_max_size
         )
         self._fps_controller = _FPSController(
@@ -67,6 +68,13 @@ def __init__(
     async def push(
         self, frame: Union[VideoFrame, AudioFrame], timestamp: Optional[float] = None
     ) -> None:
+        """Push a frame to the synchronizer
+
+        Args:
+            frame: The video or audio frame to push.
+            timestamp: (optional) The timestamp of the frame; currently used only for logging.
+                For an AudioFrame, it should be the end time of the frame.
+        """
         if isinstance(frame, AudioFrame):
             await self._audio_source.capture_frame(frame)
             if timestamp is not None:
@@ -79,53 +87,25 @@ async def clear_queue(self) -> None:
         self._audio_source.clear_queue()
         while not self._video_queue.empty():
             await self._video_queue.get()
+            self._video_queue.task_done()
 
     async def wait_for_playout(self) -> None:
         """Wait until all video and audio frames are played out."""
-        await self._audio_source.wait_for_playout()
-        await self._video_queue.join()
+        await asyncio.gather(
+            self._audio_source.wait_for_playout(),
+            self._video_queue.join(),
+        )
+
+    def reset(self) -> None:
+        self._fps_controller.reset()
 
     async def _capture_video(self) -> None:
-        count = 0
         while not self._stopped:
             frame, timestamp = await self._video_queue.get()
-
             async with self._fps_controller:
-                # debug
-                frame_rgba = np.frombuffer(frame.data, dtype=np.uint8).reshape(
-                    frame.height, frame.width, 4
-                )
-                frame_bgr = cv2.cvtColor(frame_rgba[:, :, :3], cv2.COLOR_RGBA2BGR)
-                frame_bgr = cv2.putText(
-                    frame_bgr,
-                    f"{self.actual_fps:.2f}fps, video time: {timestamp:.3f}s, "
-                    f"audio time: {self.last_audio_time:.3f}s, diff: {timestamp - self.last_audio_time:.3f}s",
-                    (10, 100),
-                    cv2.FONT_HERSHEY_SIMPLEX,
-                    1,
-                    (0, 0, 255),
-                    2,
-                )
-                frame_rgba = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGBA)
-                frame = VideoFrame(
-                    width=frame.width,
-                    height=frame.height,
-                    type=frame.type,
-                    data=frame_rgba.tobytes(),
-                )
-                count += 1
-                # end debug
-
                 self._video_source.capture_frame(frame)
                 if timestamp is not None:
                     self._last_video_time = timestamp
-
-                if count % 30 == 0:
-                    diff = self.last_video_time - self.last_audio_time
-                    print(
-                        f"{self.actual_fps:.2f}fps, last video time: {self.last_video_time:.3f}s, "
-                        f"last audio time: {self.last_audio_time:.3f}s, diff: {diff:.3f}s"
-                    )
             self._video_queue.task_done()
 
     async def aclose(self) -> None:
@@ -139,10 +119,12 @@ def actual_fps(self) -> float:
 
     @property
     def last_video_time(self) -> float:
+        """The time of the last video frame captured"""
         return self._last_video_time
 
     @property
     def last_audio_time(self) -> float:
+        """The time of the last audio frame played out"""
         return self._last_audio_time - self._audio_source.queued_duration
 
 
@@ -175,6 +157,10 @@ async def __aenter__(self) -> None:
     async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
         self.after_process()
 
+    def reset(self) -> None:
+        self._next_frame_time = None
+        self._send_timestamps.clear()
+
     async def wait_next_process(self) -> None:
         """Wait until it's time for the next frame.
 

From e5823cf5a7e40e1c33a1aaab0a952a50cab77845 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 19:18:05 +0800
Subject: [PATCH 5/7] fix types

---
 livekit-rtc/livekit/rtc/synchronizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/livekit-rtc/livekit/rtc/synchronizer.py b/livekit-rtc/livekit/rtc/synchronizer.py
index a6ea41bd..14df3b6c 100644
--- a/livekit-rtc/livekit/rtc/synchronizer.py
+++ b/livekit-rtc/livekit/rtc/synchronizer.py
@@ -8,8 +8,7 @@
 from .audio_frame import AudioFrame
 from .audio_source import AudioSource
 from .video_source import VideoSource
-import numpy as np
-import cv2
+
 
 logger = logging.getLogger(__name__)
 

From 6a70ae8c0344b878d577b4c5a4479e5aeb7ecb85 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 19:49:26 +0800
Subject: [PATCH 6/7] add wall time to log

---
 examples/video-stream/video_play.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index 0013cce4..d16068b6 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -173,15 +173,18 @@ async def _push_frames(
             await asyncio.sleep(0)
 
     async def _log_fps(av_sync: rtc.AVSynchronizer):
+        start_time = asyncio.get_running_loop().time()
         while True:
             await asyncio.sleep(2)
+            wall_time = asyncio.get_running_loop().time() - start_time
             diff = av_sync.last_video_time - av_sync.last_audio_time
-
             logger.info(
-                f"fps: {av_sync.actual_fps:.2f}, video_time: {av_sync.last_video_time:.3f}s, "
+                f"fps: {av_sync.actual_fps:.2f}, wall_time: {wall_time:.3f}s, "
+                f"video_time: {av_sync.last_video_time:.3f}s, "
                 f"audio_time: {av_sync.last_audio_time:.3f}s, diff: {diff:.3f}s"
             )
 
+
     try:
         while True:
             streamer.reset()

From 6cb5c5cfb8c422ab7336bd8ee8e844850edf73c6 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Mon, 30 Dec 2024 19:54:16 +0800
Subject: [PATCH 7/7] fix format

---
 examples/video-stream/video_play.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/video-stream/video_play.py b/examples/video-stream/video_play.py
index d16068b6..a8b970cf 100644
--- a/examples/video-stream/video_play.py
+++ b/examples/video-stream/video_play.py
@@ -184,7 +184,6 @@ async def _log_fps(av_sync: rtc.AVSynchronizer):
                 f"audio_time: {av_sync.last_audio_time:.3f}s, diff: {diff:.3f}s"
             )
 
-
     try:
         while True:
             streamer.reset()