From 2270de6ac9bedc415af6a13b50c2edf3a53335b0 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sun, 5 Jul 2026 16:34:08 +0900
Subject: [PATCH] seg_gather_fix: gather in VRAM, read back via copy engine

---
 libavcodec/apv_encode_vulkan.c         |  46 +++++++++--
 libavcodec/ffv1enc_vulkan.c            | 101 +++++++++++++++++--------
 libavcodec/vulkan/seg_gather.comp.glsl |  47 +++++++-----
 3 files changed, 139 insertions(+), 55 deletions(-)

diff --git a/libavcodec/apv_encode_vulkan.c b/libavcodec/apv_encode_vulkan.c
index 058f692636..6c0e3bbfd1 100644
--- a/libavcodec/apv_encode_vulkan.c
+++ b/libavcodec/apv_encode_vulkan.c
@@ -105,6 +105,7 @@ typedef struct VulkanEncodeAPVContext {
     /* Per-frame buffer pools */
     AVBufferPool *coeffs_pool;
     AVBufferPool *bytestream_pool;
+    AVBufferPool *gathered_pool;
     AVBufferPool *compacted_pool;
     AVBufferPool *sizes_pool;
 
@@ -404,6 +405,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
 
     FFVkBuffer *coeffs_buf;
     FFVkBuffer *bytestream_buf;
+    AVBufferRef *gathered_ref = NULL;
+    FFVkBuffer *gathered_buf;
     FFVkBuffer *compacted_buf;
     FFVkBuffer *sizes_buf;
 
@@ -426,10 +429,22 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
     bytestream_buf = (FFVkBuffer *)fd->bytestream_ref->data;
 
     /* The compaction shader gathers the sparse slots into here, contiguous.
-     * Host-visible + host-cached so the CPU readback is a fast cached copy,
-     * and the GPU writes it as one coalesced sequential stream. */
+     * Device-local: shader stores over the bus are unreliably slow on some
+     * drivers, so the transfer to the host is left to the copy engine. */
+    RET(ff_vk_get_pooled_buffer(&ev->s, &ev->gathered_pool,
+                                &gathered_ref,
+                                VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+                                VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                NULL, ev->bytestream_size,
+                                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+    gathered_buf = (FFVkBuffer *)gathered_ref->data;
+
+    /* Copy-engine destination the CPU assembles the packet from.
+     * Host-visible + host-cached so the readback is a fast cached copy. */
     RET(ff_vk_get_pooled_buffer(&ev->s, &ev->compacted_pool,
                                 &fd->compacted_ref,
+                                VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                 NULL, ev->bytestream_size,
@@ -449,6 +464,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
 
     ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->coeffs_ref, 1, 1);
     ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->bytestream_ref, 1, 1);
+    RET(ff_vk_exec_add_dep_buf(&ev->s, exec, &gathered_ref, 1, 0));
+    gathered_ref = NULL; /* exec owns it; on error fail: unrefs it */
     ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->compacted_ref, 1, 1);
     ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->sizes_ref, 1, 1);
 
@@ -551,8 +568,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
     }
 
     /* Compaction pass: gather the sparse per-tile-component slots into one
-     * contiguous, host-visible buffer. Reads VRAM, writes the host buffer as
-     * a coalesced stream -- the device->host transfer the CPU then reads. */
+     * contiguous device-local buffer, then read it back with the copy
+     * engine. */
     if (!ev->headers_only) {
         ff_vk_buf_barrier(buf_bar[nb_buf_bar++], bytestream_buf,
                           COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE,
@@ -571,7 +588,7 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
 
         CompactPushData pd = {
             .sparse    = bytestream_buf->address,
-            .compacted = compacted_buf->address,
+            .compacted = gathered_buf->address,
             .slot_size = (uint32_t)ev->slot_size,
         };
 
@@ -585,6 +602,23 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
                                        0, sizeof(pd), &pd);
 
         vk->CmdDispatch(exec->buf, ev->tile_count * ev->num_comp, 1, 1);
+
+        /* The gathered size is only known once the encode is done, so the
+         * whole buffer is copied; the slots are sized to the entropy coder's
+         * worst case, which keeps this close to the payload size. */
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], gathered_buf,
+                          COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE,
+                          TRANSFER_BIT, TRANSFER_READ_BIT, NONE,
+                          0, gathered_buf->size);
+        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .pBufferMemoryBarriers = buf_bar,
+            .bufferMemoryBarrierCount = nb_buf_bar,
+        });
+        nb_buf_bar = 0;
+
+        vk->CmdCopyBuffer(exec->buf, gathered_buf->buf, compacted_buf->buf,
+                          1, &(VkBufferCopy) { .size = ev->bytestream_size });
     }
 
     err = ff_vk_exec_submit(&ev->s, exec);
@@ -594,6 +628,7 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec,
     return 0;
 
 fail:
+    av_buffer_unref(&gathered_ref);
     ff_vk_exec_discard_deps(&ev->s, exec);
     return err;
 }
@@ -829,6 +864,7 @@ static av_cold int vulkan_encode_apv_close(AVCodecContext *avctx)
 
     av_buffer_pool_uninit(&ev->coeffs_pool);
     av_buffer_pool_uninit(&ev->bytestream_pool);
+    av_buffer_pool_uninit(&ev->gathered_pool);
     av_buffer_pool_uninit(&ev->compacted_pool);
     av_buffer_pool_uninit(&ev->sizes_pool);
 
diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index a3c67e7f43..7837dd4a17 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -91,7 +91,10 @@ typedef struct VulkanEncodeFFv1Context {
     /* Output data buffer */
     AVBufferPool *out_data_pool;
 
-    /* Gathered (contiguous) output buffer pool */
+    /* Gathered (contiguous) device-local buffer pool */
+    AVBufferPool *gathered_data_pool;
+
+    /* Host-visible readback buffer pool */
     AVBufferPool *compacted_data_pool;
 
     /* Intermediate frame pool */
@@ -301,6 +304,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     /* Output data */
     size_t maxsize;
     FFVkBuffer *out_data_buf;
+    AVBufferRef *gathered_ref = NULL;
+    FFVkBuffer *gathered_buf = NULL;
     FFVkBuffer *compacted_buf;
 
     int has_inter = avctx->gop_size > 1;
@@ -372,26 +377,40 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     maxsize = ffv1_vk_buffer_size(avctx);
     maxsize = FFMIN(maxsize, fv->s.props_11.maxMemoryAllocationSize);
 
-    /* Sparse per-slice output: written by encode, read by gather, never by the
-     * CPU, so device-local unless it won't fit in VRAM. */
-    VkMemoryPropertyFlagBits out_buf_flags;
-    if (maxsize < fv->max_heap_size)
-        out_buf_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
-    else
-        out_buf_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                        fv->s.host_cached_flag;
+    /* The sparse per-slice slots and the gathered copy both stay in VRAM,
+     * and the bitstream is read back with a single copy-engine transfer.
+     * Shaders only touch host memory if VRAM cannot hold both buffers:
+     * shader stores over the bus are unreliably slow on some drivers. */
+    int host_only = 2*maxsize > fv->max_heap_size;
 
     RET(ff_vk_get_pooled_buffer(&fv->s, &fv->out_data_pool,
                                 &fd->out_data_ref,
                                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                NULL, maxsize, out_buf_flags));
+                                NULL, maxsize,
+                                host_only ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                            fv->s.host_cached_flag :
+                                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
     out_data_buf = (FFVkBuffer *)fd->out_data_ref->data;
 
-    /* Contiguous gathered output, read back by the CPU. */
+    /* Device-local gather destination */
+    if (!host_only) {
+        RET(ff_vk_get_pooled_buffer(&fv->s, &fv->gathered_data_pool,
+                                    &gathered_ref,
+                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+                                    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                    VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                    NULL, maxsize,
+                                    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
+        gathered_buf = (FFVkBuffer *)gathered_ref->data;
+    }
+
+    /* Contiguous output, read back by the CPU. Gathered into directly
+     * when the gather destination cannot live in VRAM. */
     RET(ff_vk_get_pooled_buffer(&fv->s, &fv->compacted_data_pool,
                                 &fd->compacted_data_ref,
+                                VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                                 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                 NULL, maxsize,
@@ -465,6 +484,10 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     ff_vk_exec_add_dep_buf(&fv->s, exec, &slice_data_ref, 1, has_inter);
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1);
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->compacted_data_ref, 1, 1);
+    if (gathered_ref) {
+        RET(ff_vk_exec_add_dep_buf(&fv->s, exec, &gathered_ref, 1, 0));
+        gathered_ref = NULL; /* exec owns it; on error fail: unrefs it */
+    }
     if (f->remap_mode) {
         ff_vk_exec_add_dep_buf(&fv->s, exec, &remap_data_ref, 1, 0);
         remap_data_ref = NULL;
@@ -699,8 +722,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                    0, sizeof(FFv1ShaderParams), &pd);
     vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
 
-    /* Gather the per-slice slots into one contiguous host-visible buffer,
-     * in the same submission (no separate transfer pass). */
+    /* Gather the per-slice slots into one contiguous buffer, in the same
+     * submission. */
     FFVkBuffer *results_buf = &fv->results_buf;
     ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf,
                       COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
@@ -719,7 +742,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
 
     SegGatherPushData gather_pd = {
         .sparse    = out_data_buf->address,
-        .compacted = compacted_buf->address,
+        .compacted = gathered_buf ? gathered_buf->address :
+                                    compacted_buf->address,
         .slot_size = (uint32_t)((out_data_buf->size / f->slice_count) & ~(size_t)15),
     };
     ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->gather, 0, 0, 0,
@@ -733,6 +757,25 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                    0, sizeof(gather_pd), &gather_pd);
     vk->CmdDispatch(exec->buf, f->slice_count, 1, 1);
 
+    /* Read the gathered bitstream back with the copy engine. The size is
+     * only known once the encode is done, so the whole buffer is copied;
+     * with the version-4 slot sizing this is close to the payload size. */
+    if (gathered_buf) {
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], gathered_buf,
+                          COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                          TRANSFER_BIT, TRANSFER_READ_BIT, NONE_KHR,
+                          0, VK_WHOLE_SIZE);
+        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .pBufferMemoryBarriers = buf_bar,
+            .bufferMemoryBarrierCount = nb_buf_bar,
+        });
+        nb_buf_bar = 0;
+
+        vk->CmdCopyBuffer(exec->buf, gathered_buf->buf, compacted_buf->buf,
+                          1, &(VkBufferCopy) { .size = maxsize });
+    }
+
     /* Submit */
     err = ff_vk_exec_submit(&fv->s, exec);
     if (err < 0)
@@ -747,6 +790,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
 
 fail:
     av_frame_free(&tmp);
+    av_buffer_unref(&gathered_ref);
     ff_vk_exec_discard_deps(&fv->s, exec);
 
     return err;
@@ -1290,7 +1334,7 @@ fail:
 static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
 {
     int err;
-    size_t maxsize, max_heap_size, max_host_size;
+    size_t maxsize, max_heap_size;
     VulkanEncodeFFv1Context *fv = avctx->priv_data;
     FFV1Context *f = &fv->ctx;
 
@@ -1400,16 +1444,14 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
 
     /* Try to measure VRAM size */
     max_heap_size = 0;
-    max_host_size = 0;
     for (int i = 0; i < fv->s.mprops.memoryHeapCount; i++) {
         if (fv->s.mprops.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
-            max_heap_size = FFMAX(fv->max_heap_size,
-                                  fv->s.mprops.memoryHeaps[i].size);
-        if (!(fv->s.mprops.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT))
-            max_host_size = FFMAX(max_host_size,
+            max_heap_size = FFMAX(max_heap_size,
                                   fv->s.mprops.memoryHeaps[i].size);
     }
-    fv->max_heap_size = max_heap_size;
+
+    /* Keep 1/8th of VRAM as headroom */
+    fv->max_heap_size = max_heap_size - (max_heap_size >> 3);
 
     maxsize = ffv1_vk_buffer_size(avctx);
     if (maxsize > fv->s.props_11.maxMemoryAllocationSize) {
@@ -1419,21 +1461,15 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
         maxsize = fv->s.props_11.maxMemoryAllocationSize;
     }
 
-    if (max_heap_size < maxsize) {
-        av_log(avctx, AV_LOG_WARNING, "Encoding buffer (%zu) larger than VRAM (%zu), "
+    /* Sparse output slots + gathered copy */
+    if (2*maxsize > fv->max_heap_size)
+        av_log(avctx, AV_LOG_WARNING, "Encoding buffers (2x%zu) larger than VRAM (%zu), "
                                       "using host memory (slower)\n",
                maxsize, fv->max_heap_size);
 
-        /* Keep 1/2th of RAM as headroom */
-        max_heap_size = max_host_size - (max_host_size >> 1);
-    } else {
-        /* Keep 1/8th of VRAM as headroom */
-        max_heap_size = max_heap_size - (max_heap_size >> 3);
-    }
-
     av_log(avctx, AV_LOG_INFO, "Async buffers: %zuMiB per context, %zuMiB total, depth: %i\n",
-           maxsize / (1024*1024),
-           (fv->async_depth * maxsize) / (1024*1024),
+           2*maxsize / (1024*1024),
+           (fv->async_depth * 2*maxsize) / (1024*1024),
            fv->async_depth);
 
     err = ff_vk_exec_pool_init(&fv->s, fv->qf, &fv->exec_pool,
@@ -1602,6 +1638,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     av_buffer_unref(&fv->intermediate_frames_ref);
 
     av_buffer_pool_uninit(&fv->out_data_pool);
+    av_buffer_pool_uninit(&fv->gathered_data_pool);
     av_buffer_pool_uninit(&fv->compacted_data_pool);
 
     av_buffer_unref(&fv->keyframe_slice_data_ref);
diff --git a/libavcodec/vulkan/seg_gather.comp.glsl b/libavcodec/vulkan/seg_gather.comp.glsl
index 772f0200ff..2fc92630b2 100644
--- a/libavcodec/vulkan/seg_gather.comp.glsl
+++ b/libavcodec/vulkan/seg_gather.comp.glsl
@@ -31,8 +31,8 @@
  * FFv1 slice, ...) into its own fixed-stride, device-local slot, because no
  * encoder workgroup knows the others' final sizes. Run afterwards, this
  * shader prefix-sums the per-segment sizes and packs the segments back to
- * back into one contiguous, host-visible buffer -- so the device->host
- * transfer is a single coalesced stream rather than a scattered dribble.
+ * back into one contiguous buffer -- so the device->host transfer is a
+ * single coalesced stream rather than a scattered dribble.
  *
  * One workgroup per segment. Inputs: the sparse slot buffer, the per-segment
  * sizes, and the slot stride.
@@ -43,9 +43,9 @@ layout (set = 0, binding = 0, scalar) readonly buffer sizes_buf {
 };
 
 layout (push_constant, scalar) uniform pushConstants {
-    u8buf sparse;       /* device-local: one slot per segment */
-    u8buf compacted;    /* host-visible: contiguous output    */
-    uint  slot_size;    /* stride between sparse slots         */
+    u8buf sparse;       /* one slot per segment       */
+    u8buf compacted;    /* contiguous output          */
+    uint  slot_size;    /* stride between sparse slots */
 };
 
 shared uint s_dst_off;
@@ -60,17 +60,24 @@ u32vec4 funnel(u32vec4 lo, u32vec4 hi, uint sh)
     if (sh == 0u)
         return lo;
 
-    uint s[8] = uint[8](lo.x, lo.y, lo.z, lo.w, hi.x, hi.y, hi.z, hi.w);
     uint uw = sh >> 2u;             /* whole uints into the window       */
     uint bb = (sh & 3u) << 3u;      /* remaining sub-uint shift, in bits  */
 
+    /* s[uw..uw+3] and s[uw+1..uw+4] of the concatenated (lo, hi) pair,
+     * selected via uniform branches; a dynamically indexed local array
+     * would get spilled to scratch memory. */
+    u32vec4 a, c;
+    switch (uw) {
+    case 0u: a = lo;                     c = u32vec4(lo.yzw, hi.x); break;
+    case 1u: a = u32vec4(lo.yzw, hi.x);  c = u32vec4(lo.zw, hi.xy); break;
+    case 2u: a = u32vec4(lo.zw, hi.xy);  c = u32vec4(lo.w, hi.xyz); break;
+    default: a = u32vec4(lo.w, hi.xyz);  c = hi;                    break;
+    }
+
     if (bb == 0u)
-        return u32vec4(s[uw], s[uw + 1u], s[uw + 2u], s[uw + 3u]);
+        return a;
 
-    return u32vec4((s[uw     ] >> bb) | (s[uw + 1u] << (32u - bb)),
-                   (s[uw + 1u] >> bb) | (s[uw + 2u] << (32u - bb)),
-                   (s[uw + 2u] >> bb) | (s[uw + 3u] << (32u - bb)),
-                   (s[uw + 3u] >> bb) | (s[uw + 4u] << (32u - bb)));
+    return (a >> bb) | (c << (32u - bb));
 }
 
 void main(void)
@@ -82,14 +89,18 @@ void main(void)
     /*
      * Destination offset: the sum of all preceding segment sizes. The output
      * is packed tight -- segments back to back -- so it is usable directly as
-     * the assembled bitstream.
+     * the assembled bitstream. Reduced by the whole workgroup; a serial loop
+     * on one thread idles the rest of it and is latency-bound on the reads.
      */
-    if (b == 0u) {
-        uint o = 0u;
-        for (uint i = 0u; i < seg; i++)
-            o += seg_sizes[i];
-        s_dst_off = o;
-    }
+    if (b == 0u)
+        s_dst_off = 0u;
+    barrier();
+
+    uint acc = 0u;
+    for (uint i = b; i < seg; i += wg)
+        acc += seg_sizes[i];
+    if (acc != 0u)
+        atomicAdd(s_dst_off, acc);
     barrier();
 
     const uint n = seg_sizes[seg];
-- 
2.53.0