From e508ceb72553b1abcff97cfb17f7a8967125de3e Mon Sep 17 00:00:00 2001
From: aler9 <46489434+aler9@users.noreply.github.com>
Date: Mon, 21 Oct 2024 00:41:03 +0200
Subject: [PATCH] improve performance by using DMA buffers

this speeds up text drawing and software encoding.
---
 camera.cpp          | 89 +++++++++++++++++++++++++++++----------------
 encoder.c           |  4 +-
 encoder.h           |  5 +--
 encoder_hard_h264.c | 14 +++----
 encoder_hard_h264.h |  6 ++-
 encoder_soft_h264.c |  6 +--
 encoder_soft_h264.h |  4 +-
 main.c              | 27 ++++++++++----
 pipe.c              |  8 ++--
 pipe.h              |  2 +-
 10 files changed, 100 insertions(+), 65 deletions(-)

diff --git a/camera.cpp b/camera.cpp
index 9ecec38..78e3e0f 100644
--- a/camera.cpp
+++ b/camera.cpp
@@ -8,6 +8,9 @@
 #include
 #include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -16,7 +19,6 @@
 #include
 #include
 #include
-#include
 
 #include "camera.h"
 
@@ -31,17 +33,35 @@ using libcamera::Orientation;
 using libcamera::PixelFormat;
 using libcamera::Rectangle;
 using libcamera::Request;
+using libcamera::SharedFD;
 using libcamera::Size;
 using libcamera::Span;
 using libcamera::Stream;
 using libcamera::StreamRole;
 using libcamera::StreamConfiguration;
 using libcamera::Transform;
+using libcamera::UniqueFD;
 
 namespace controls = libcamera::controls;
 namespace formats = libcamera::formats;
 namespace properties = libcamera::properties;
 
+static const char *heap_positions[] = {
+    "/dev/dma_heap/vidbuf_cached",
+    "/dev/dma_heap/linux,cma",
+};
+
+// https://github.com/raspberrypi/rpicam-apps/blob/6de1ab6a899df35f929b2a15c0831780bd8e750e/core/dma_heaps.cpp
+static int create_dma_allocator() {
+    for (unsigned int i = 0; i < sizeof(heap_positions) / sizeof(heap_positions[0]); i++) {
+        int fd = open(heap_positions[i], O_RDWR | O_CLOEXEC, 0);
+        if (fd >= 0) {
+            return fd;
+        }
+    }
+    return -1;
+}
+
 static char errbuf[256];
 
 static void set_error(const char *format, ...)
 {
@@ -80,10 +100,10 @@ struct CameraPriv {
     std::unique_ptr<CameraManager> camera_manager;
     std::shared_ptr<Camera> camera;
     Stream *video_stream;
-    std::unique_ptr<FrameBufferAllocator> allocator;
     std::vector<std::unique_ptr<Request>> requests;
     std::mutex ctrls_mutex;
     std::unique_ptr<ControlList> ctrls;
+    std::vector<std::unique_ptr<FrameBuffer>> frame_buffers;
     std::map<FrameBuffer *, uint8_t *> mapped_buffers;
     bool ts_initialized;
     uint64_t ts_start;
@@ -96,22 +116,6 @@ static int get_v4l2_colorspace(std::optional<ColorSpace> const &cs) {
     return V4L2_COLORSPACE_SMPTE170M;
 }
 
-// https://github.com/raspberrypi/libcamera-apps/blob/a5b5506a132056ac48ba22bc581cc394456da339/core/libcamera_app.cpp#L824
-static uint8_t *map_buffer(FrameBuffer *buffer) {
-    size_t buffer_size = 0;
-
-    for (unsigned i = 0; i < buffer->planes().size(); i++) {
-        const FrameBuffer::Plane &plane = buffer->planes()[i];
-        buffer_size += plane.length;
-
-        if (i == buffer->planes().size() - 1 || plane.fd.get() != buffer->planes()[i + 1].fd.get()) {
-            return (uint8_t *)mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, MAP_SHARED, plane.fd.get(), 0);
-        }
-    }
-
-    return NULL;
-}
-
 // https://github.com/raspberrypi/libcamera-apps/blob/a6267d51949d0602eedf60f3ddf8c6685f652812/core/options.cpp#L101
 static void set_hdr(bool hdr) {
     bool ok = false;
@@ -175,7 +179,7 @@ bool camera_create(const parameters_t *params, camera_frame_cb frame_cb, camera_
         return false;
     }
 
-    std::vector<libcamera::StreamRole> stream_roles = { StreamRole::VideoRecording };
+    std::vector<StreamRole> stream_roles = { StreamRole::VideoRecording };
 
     if (params->mode != NULL) {
        stream_roles.push_back(StreamRole::Raw);
     }
@@ -187,7 +191,7 @@
     }
 
     StreamConfiguration &video_stream_conf = conf->at(0);
-    video_stream_conf.size = libcamera::Size(params->width, params->height);
+    video_stream_conf.size = Size(params->width, params->height);
     video_stream_conf.pixelFormat = formats::YUV420;
     video_stream_conf.bufferCount = params->buffer_count;
 
     if (params->width >= 1280 || params->height >= 720) {
@@ -234,24 +238,45 @@ bool camera_create(const parameters_t *params, camera_frame_cb frame_cb, camera_
         camp->requests.push_back(std::move(request));
     }
 
-    camp->allocator = std::make_unique<FrameBufferAllocator>(camp->camera);
+    // allocate DMA buffers manually instead of using default buffers provided by libcamera.
+    // this improves performance by a lot.
+    // https://forums.raspberrypi.com/viewtopic.php?t=352554
+    // https://github.com/raspberrypi/rpicam-apps/blob/6de1ab6a899df35f929b2a15c0831780bd8e750e/core/rpicam_app.cpp#L1012
+
+    int allocator_fd = create_dma_allocator();
+    if (allocator_fd < 0) {
+        set_error("failed to open dma heap allocator");
+        return false;
+    }
+
     for (StreamConfiguration &stream_conf : *conf) {
         Stream *stream = stream_conf.stream();
 
-        res = camp->allocator->allocate(stream);
-        if (res < 0) {
-            set_error("allocate() failed");
-            return false;
-        }
+        for (unsigned int i = 0; i < params->buffer_count; i++) {
+            struct dma_heap_allocation_data alloc = {};
+            alloc.len = stream_conf.frameSize;
+            alloc.fd_flags = O_CLOEXEC | O_RDWR;
+            int ret = ioctl(allocator_fd, DMA_HEAP_IOCTL_ALLOC, &alloc);
+            if (ret < 0) {
+                set_error("failed to allocate buffer in dma heap");
+                return false;
+            }
+            UniqueFD fd(alloc.fd);
 
-        int i = 0;
-        for (const std::unique_ptr<FrameBuffer> &buffer : camp->allocator->buffers(stream)) {
-            // map buffer of the video stream only
+            std::vector<FrameBuffer::Plane> plane(1);
+            plane[0].fd = SharedFD(std::move(fd));
+            plane[0].offset = 0;
+            plane[0].length = stream_conf.frameSize;
+
+            camp->frame_buffers.push_back(std::make_unique<FrameBuffer>(plane));
+            FrameBuffer *fb = camp->frame_buffers.back().get();
+
+            // map buffers of the video stream only
             if (stream == video_stream_conf.stream()) {
-                camp->mapped_buffers[buffer.get()] = map_buffer(buffer.get());
+                camp->mapped_buffers[fb] = (uint8_t*)mmap(NULL, stream_conf.frameSize, PROT_READ | PROT_WRITE, MAP_SHARED, plane[0].fd.get(), 0);
             }
 
-            res = camp->requests.at(i++)->addBuffer(stream, buffer.get());
+            res = camp->requests.at(i)->addBuffer(stream, fb);
             if (res != 0) {
                 set_error("addBuffer() failed");
                 return false;
@@ -259,6 +284,8 @@ bool camera_create(const parameters_t *params, camera_frame_cb frame_cb, camera_
         }
     }
 
+    close(allocator_fd);
+
     camp->params = params;
     camp->frame_cb = frame_cb;
     *cam = camp.release();
diff --git a/encoder.c b/encoder.c
index 1d48cc1..266709f 100644
--- a/encoder.c
+++ b/encoder.c
@@ -13,8 +13,6 @@
 #include "encoder_soft_h264.h"
 #include "encoder.h"
 
-#define HARDWARE_DEVICE "/dev/video11"
-
 static char errbuf[256];
 
 static void set_error(const char *format, ...) {
@@ -38,7 +36,7 @@
 } encoder_priv_t;
 
 static bool supports_hardware_h264() {
-    int fd = open(HARDWARE_DEVICE, O_RDWR, 0);
+    int fd = open(ENCODER_HARD_H264_DEVICE, O_RDWR, 0);
     if (fd < 0) {
         return false;
     }
diff --git a/encoder.h b/encoder.h
index 6f6e7af..713cc99 100644
--- a/encoder.h
+++ b/encoder.h
@@ -5,10 +5,7 @@
 
 typedef void encoder_t;
 
-typedef void (*encoder_output_cb)(
-    uint64_t ts,
-    const uint8_t *buf,
-    uint64_t size);
+typedef void (*encoder_output_cb)(const uint8_t *mapped, uint64_t size, uint64_t ts);
 
 const char *encoder_get_error();
 bool encoder_create(const parameters_t *params, int stride, int colorspace, encoder_output_cb output_cb, encoder_t **enc);
diff --git a/encoder_hard_h264.c b/encoder_hard_h264.c
index 6309b24..2c3e022 100644
--- a/encoder_hard_h264.c
+++ b/encoder_hard_h264.c
@@ -15,8 +15,6 @@
 #include "encoder_hard_h264.h"
 
-#define DEVICE "/dev/video11"
-
 static char errbuf[256];
 
 static void set_error(const char *format, ...)
 {
@@ -61,11 +59,11 @@ static void *output_thread(void *userdata) {
         exit(1);
     }
 
+    const uint8_t *mapped = (const uint8_t *)encp->capture_buffers[buf.index];
+    int size = buf.m.planes[0].bytesused;
     uint64_t ts = ((uint64_t)buf.timestamp.tv_sec * (uint64_t)1000000) + (uint64_t)buf.timestamp.tv_usec;
 
-    const uint8_t *buf_mem = (const uint8_t *)encp->capture_buffers[buf.index];
-    int buf_size = buf.m.planes[0].bytesused;
-    encp->output_cb(ts, buf_mem, buf_size);
+    encp->output_cb(mapped, size, ts);
 
     res = ioctl(encp->fd, VIDIOC_QBUF, &buf);
     if (res != 0) {
@@ -103,7 +101,7 @@ bool encoder_hard_h264_create(const parameters_t *params, int stride, int colors
     encoder_hard_h264_priv_t *encp = (encoder_hard_h264_priv_t *)(*enc);
     memset(encp, 0, sizeof(encoder_hard_h264_priv_t));
 
-    encp->fd = open(DEVICE, O_RDWR, 0);
+    encp->fd = open(ENCODER_HARD_H264_DEVICE, O_RDWR, 0);
     if (encp->fd < 0) {
         set_error("unable to open device");
         goto failed;
     }
@@ -266,7 +264,7 @@ bool encoder_hard_h264_create(const parameters_t *params, int stride, int colors
     return false;
 }
 
-void encoder_hard_h264_encode(encoder_hard_h264_t *enc, uint8_t *mapped_buffer, int buffer_fd, size_t size, uint64_t ts) {
+void encoder_hard_h264_encode(encoder_hard_h264_t *enc, uint8_t *mapped, int fd, size_t size, uint64_t ts) {
     encoder_hard_h264_priv_t *encp = (encoder_hard_h264_priv_t *)enc;
 
     int index = encp->cur_buffer++;
@@ -282,7 +280,7 @@ void encoder_hard_h264_encode(encoder_hard_h264_t *enc, uint8_t *mapped_buffer,
     buf.timestamp.tv_sec = ts / 1000000;
     buf.timestamp.tv_usec = ts % 1000000;
     buf.m.planes = planes;
-    buf.m.planes[0].m.fd = buffer_fd;
+    buf.m.planes[0].m.fd = fd;
     buf.m.planes[0].bytesused = size;
     buf.m.planes[0].length = size;
     int res = ioctl(encp->fd, VIDIOC_QBUF, &buf);
diff --git a/encoder_hard_h264.h b/encoder_hard_h264.h
index ef2c9b6..dfaa92d 100644
--- a/encoder_hard_h264.h
+++ b/encoder_hard_h264.h
@@ -3,13 +3,15 @@
 
 #include "parameters.h"
 
+#define ENCODER_HARD_H264_DEVICE "/dev/video11"
+
 typedef void encoder_hard_h264_t;
 
-typedef void (*encoder_hard_h264_output_cb)(uint64_t ts, const uint8_t *buf, uint64_t size);
+typedef void (*encoder_hard_h264_output_cb)(const uint8_t *mapped, uint64_t size, uint64_t ts);
 
 const char *encoder_hard_h264_get_error();
 bool encoder_hard_h264_create(const parameters_t *params, int stride, int colorspace, encoder_hard_h264_output_cb output_cb, encoder_hard_h264_t **enc);
-void encoder_hard_h264_encode(encoder_hard_h264_t *enc, uint8_t *mapped_buffer, int buffer_fd, size_t size, uint64_t ts);
+void encoder_hard_h264_encode(encoder_hard_h264_t *enc, uint8_t *mapped, int fd, size_t size, uint64_t ts);
 void encoder_hard_h264_reload_params(encoder_hard_h264_t *enc, const parameters_t *params);
 
 #endif
diff --git a/encoder_soft_h264.c b/encoder_soft_h264.c
index 1e5ea00..96f031c 100644
--- a/encoder_soft_h264.c
+++ b/encoder_soft_h264.c
@@ -97,10 +97,10 @@ bool encoder_soft_h264_create(const parameters_t *params, int stride, int colors
     return false;
 }
 
-void encoder_soft_h264_encode(encoder_soft_h264_t *enc, uint8_t *mapped_buffer, int buffer_fd, size_t size, uint64_t ts) {
+void encoder_soft_h264_encode(encoder_soft_h264_t *enc, uint8_t *mapped, int fd, size_t size, uint64_t ts) {
     encoder_soft_h264_priv_t *encp = (encoder_soft_h264_priv_t *)enc;
 
-    encp->x_pic_in.img.plane[0] = mapped_buffer; // Y
+    encp->x_pic_in.img.plane[0] = mapped; // Y
     encp->x_pic_in.img.plane[1] = encp->x_pic_in.img.plane[0] + encp->x_pic_in.img.i_stride[0] * encp->params->height; // U
     encp->x_pic_in.img.plane[2] =
         encp->x_pic_in.img.plane[1] + (encp->x_pic_in.img.i_stride[0] / 2) * (encp->params->height / 2); // V
     encp->x_pic_in.i_pts = encp->next_pts++;
@@ -113,7 +113,7 @@ void encoder_soft_h264_encode(encoder_soft_h264_t *enc, uint8_t *mapped_buffer,
 
     pthread_mutex_unlock(&encp->mutex);
 
-    encp->output_cb(ts, nal->p_payload, frame_size);
+    encp->output_cb(nal->p_payload, frame_size, ts);
 }
 
 void encoder_soft_h264_reload_params(encoder_soft_h264_t *enc, const parameters_t *params) {
diff --git a/encoder_soft_h264.h b/encoder_soft_h264.h
index 0e74e56..60c8834 100644
--- a/encoder_soft_h264.h
+++ b/encoder_soft_h264.h
@@ -5,11 +5,11 @@
 
 typedef void encoder_soft_h264_t;
 
-typedef void (*encoder_soft_h264_output_cb)(uint64_t ts, const uint8_t *buf, uint64_t size);
+typedef void (*encoder_soft_h264_output_cb)(const uint8_t *mapped, uint64_t size, uint64_t ts);
 
 const char *encoder_soft_h264_get_error();
 bool encoder_soft_h264_create(const parameters_t *params, int stride, int colorspace, encoder_soft_h264_output_cb output_cb, encoder_soft_h264_t **enc);
-void encoder_soft_h264_encode(encoder_soft_h264_t *enc, uint8_t *mapped_buffer, int buffer_fd, size_t size, uint64_t ts);
+void encoder_soft_h264_encode(encoder_soft_h264_t *enc, uint8_t *mapped, int fd, size_t size, uint64_t ts);
 void encoder_soft_h264_reload_params(encoder_soft_h264_t *enc, const parameters_t *params);
 
 #endif
diff --git a/main.c b/main.c
index c3337d1..110879f 100644
--- a/main.c
+++ b/main.c
@@ -5,6 +5,9 @@
 #include
 #include
 #include
+#include
+
+#include
 
 #include "parameters.h"
 #include "pipe.h"
@@ -18,17 +21,27 @@
 static text_t *text;
 static encoder_t *enc;
 
 static void on_frame(
-    uint8_t *mapped_buffer,
-    int buffer_fd,
+    uint8_t *mapped,
+    int fd,
     uint64_t size,
-    uint64_t timestamp) {
-    text_draw(text, mapped_buffer);
-    encoder_encode(enc, mapped_buffer, buffer_fd, size, timestamp);
+    uint64_t ts) {
+    // mapped DMA buffers require a DMA_BUF_IOCTL_SYNC before and after usage.
+    // https://forums.raspberrypi.com/viewtopic.php?t=352554
+    struct dma_buf_sync dma_sync = {0};
+    dma_sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
+    ioctl(fd, DMA_BUF_IOCTL_SYNC, &dma_sync);
+
+    text_draw(text, mapped);
+
+    dma_sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
+    ioctl(fd, DMA_BUF_IOCTL_SYNC, &dma_sync);
+
+    encoder_encode(enc, mapped, fd, size, ts);
 }
 
-static void on_encoder_output(uint64_t ts, const uint8_t *buf, uint64_t size) {
+static void on_encoder_output(const uint8_t *mapped, uint64_t size, uint64_t ts) {
     pthread_mutex_lock(&pipe_video_mutex);
-    pipe_write_buf(pipe_video_fd, ts, buf, size);
+    pipe_write_buf(pipe_video_fd, mapped, size, ts);
     pthread_mutex_unlock(&pipe_video_mutex);
 }
diff --git a/pipe.c b/pipe.c
index 38d2437..fd43d41 100644
--- a/pipe.c
+++ b/pipe.c
@@ -25,13 +25,13 @@ void pipe_write_ready(int fd) {
     write(fd, buf, n);
 }
 
-void pipe_write_buf(int fd, uint64_t ts, const uint8_t *buf, uint32_t n) {
+void pipe_write_buf(int fd, const uint8_t *mapped, uint32_t size, uint64_t ts) {
     char head[] = {'b'};
-    n += 1 + sizeof(uint64_t);
-    write(fd, &n, 4);
+    size += 1 + sizeof(uint64_t);
+    write(fd, &size, 4);
     write(fd, head, 1);
     write(fd, &ts, sizeof(uint64_t));
-    write(fd, buf, n - 1 - sizeof(uint64_t));
+    write(fd, mapped, size - 1 - sizeof(uint64_t));
 }
 
 uint32_t pipe_read(int fd, uint8_t **pbuf) {
diff --git a/pipe.h b/pipe.h
index 2663045..83b76b5 100644
--- a/pipe.h
+++ b/pipe.h
@@ -6,7 +6,7 @@
 void pipe_write_error(int fd, const char *format, ...);
 
 void pipe_write_ready(int fd);
-void pipe_write_buf(int fd, uint64_t ts, const uint8_t *buf, uint32_t n);
+void pipe_write_buf(int fd, const uint8_t *mapped, uint32_t size, uint64_t ts);
 uint32_t pipe_read(int fd, uint8_t **pbuf);
 
 #endif
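
Background on the interfaces used above: the patch allocates frame buffers from a Linux DMA heap (/dev/dma_heap/*, DMA_HEAP_IOCTL_ALLOC), wraps the returned dma-buf file descriptors in libcamera FrameBuffer objects, and brackets every CPU access to the mapped memory with DMA_BUF_IOCTL_SYNC so CPU caches stay coherent with the devices sharing the buffer. The cached heap mapping is the likely reason CPU-side work such as text drawing and x264 software encoding gets faster than with the default buffers. The following is a minimal standalone sketch of the same allocate/map/sync cycle, not part of the patch; the heap path, the 1 MiB size and the memset stand-in for text_draw()/encoding are illustrative assumptions, and error handling is reduced to early returns.

/* sketch: allocate one buffer from a DMA heap, map it, and bracket CPU access
   with DMA_BUF_IOCTL_SYNC. assumes a Linux system exposing /dev/dma_heap. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/dma-buf.h>
#include <linux/dma-heap.h>

int main(void) {
    // "linux,cma" is used here as an example heap; the patch prefers
    // "vidbuf_cached" when it is available.
    int heap_fd = open("/dev/dma_heap/linux,cma", O_RDWR | O_CLOEXEC);
    if (heap_fd < 0) {
        return 1;
    }

    // ask the heap for a 1 MiB buffer; the kernel returns a dma-buf fd in alloc.fd.
    struct dma_heap_allocation_data alloc = {0};
    alloc.len = 1024 * 1024;
    alloc.fd_flags = O_RDWR | O_CLOEXEC;
    if (ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc) < 0) {
        close(heap_fd);
        return 1;
    }
    close(heap_fd); // the allocator fd is not needed once the buffer exists

    // map the dma-buf into the process, like camera.cpp does per frame buffer.
    uint8_t *mapped = mmap(NULL, alloc.len, PROT_READ | PROT_WRITE, MAP_SHARED, alloc.fd, 0);
    if (mapped == MAP_FAILED) {
        close(alloc.fd);
        return 1;
    }

    // CPU access must be bracketed by sync ioctls, as in on_frame() above.
    struct dma_buf_sync sync = {0};
    sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
    ioctl(alloc.fd, DMA_BUF_IOCTL_SYNC, &sync);

    memset(mapped, 0, alloc.len); // stand-in for text drawing / encoder input

    sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
    ioctl(alloc.fd, DMA_BUF_IOCTL_SYNC, &sync);

    munmap(mapped, alloc.len);
    close(alloc.fd);
    return 0;
}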