From 2270de6ac9bedc415af6a13b50c2edf3a53335b0 Mon Sep 17 00:00:00 2001 From: Lynne Date: Sun, 5 Jul 2026 16:34:08 +0900 Subject: [PATCH] seg_gather_fix --- libavcodec/apv_encode_vulkan.c | 46 +++++++++-- libavcodec/ffv1enc_vulkan.c | 101 +++++++++++++++++-------- libavcodec/vulkan/seg_gather.comp.glsl | 47 +++++++----- 3 files changed, 139 insertions(+), 55 deletions(-) diff --git a/libavcodec/apv_encode_vulkan.c b/libavcodec/apv_encode_vulkan.c index 058f692636..6c0e3bbfd1 100644 --- a/libavcodec/apv_encode_vulkan.c +++ b/libavcodec/apv_encode_vulkan.c @@ -105,6 +105,7 @@ typedef struct VulkanEncodeAPVContext { /* Per-frame buffer pools */ AVBufferPool *coeffs_pool; AVBufferPool *bytestream_pool; + AVBufferPool *gathered_pool; AVBufferPool *compacted_pool; AVBufferPool *sizes_pool; @@ -404,6 +405,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, FFVkBuffer *coeffs_buf; FFVkBuffer *bytestream_buf; + AVBufferRef *gathered_ref = NULL; + FFVkBuffer *gathered_buf; FFVkBuffer *compacted_buf; FFVkBuffer *sizes_buf; @@ -426,10 +429,22 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, bytestream_buf = (FFVkBuffer *)fd->bytestream_ref->data; /* The compaction shader gathers the sparse slots into here, contiguous. - * Host-visible + host-cached so the CPU readback is a fast cached copy, - * and the GPU writes it as one coalesced sequential stream. */ + * Device-local: shader stores over the bus are unreliably slow on some + * drivers, so the transfer to the host is left to the copy engine. */ + RET(ff_vk_get_pooled_buffer(&ev->s, &ev->gathered_pool, + &gathered_ref, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, ev->bytestream_size, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + gathered_buf = (FFVkBuffer *)gathered_ref->data; + + /* Copy-engine destination the CPU assembles the packet from. + * Host-visible + host-cached so the readback is a fast cached copy. */ RET(ff_vk_get_pooled_buffer(&ev->s, &ev->compacted_pool, &fd->compacted_ref, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, ev->bytestream_size, @@ -449,6 +464,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->coeffs_ref, 1, 1); ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->bytestream_ref, 1, 1); + ff_vk_exec_add_dep_buf(&ev->s, exec, &gathered_ref, 1, 0); + gathered_ref = NULL; /* Ownership passed */ ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->compacted_ref, 1, 1); ff_vk_exec_add_dep_buf(&ev->s, exec, &fd->sizes_ref, 1, 1); @@ -551,8 +568,8 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, } /* Compaction pass: gather the sparse per-tile-component slots into one - * contiguous, host-visible buffer. Reads VRAM, writes the host buffer as - * a coalesced stream -- the device->host transfer the CPU then reads. */ + * contiguous device-local buffer, then read it back with the copy + * engine. */ if (!ev->headers_only) { ff_vk_buf_barrier(buf_bar[nb_buf_bar++], bytestream_buf, COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE, @@ -571,7 +588,7 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, CompactPushData pd = { .sparse = bytestream_buf->address, - .compacted = compacted_buf->address, + .compacted = gathered_buf->address, .slot_size = (uint32_t)ev->slot_size, }; @@ -585,6 +602,23 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, 0, sizeof(pd), &pd); vk->CmdDispatch(exec->buf, ev->tile_count * ev->num_comp, 1, 1); + + /* The gathered size is only known once the encode is done, so the + * whole buffer is copied; the slots are sized to the entropy coder's + * worst case, which keeps this close to the payload size. */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], gathered_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE, + TRANSFER_BIT, TRANSFER_READ_BIT, NONE, + 0, gathered_buf->size); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + vk->CmdCopyBuffer(exec->buf, gathered_buf->buf, compacted_buf->buf, + 1, &(VkBufferCopy) { .size = ev->bytestream_size }); } err = ff_vk_exec_submit(&ev->s, exec); @@ -594,6 +628,7 @@ static int submit_frame(AVCodecContext *avctx, FFVkExecContext *exec, return 0; fail: + av_buffer_unref(&gathered_ref); ff_vk_exec_discard_deps(&ev->s, exec); return err; } @@ -829,6 +864,7 @@ static av_cold int vulkan_encode_apv_close(AVCodecContext *avctx) av_buffer_pool_uninit(&ev->coeffs_pool); av_buffer_pool_uninit(&ev->bytestream_pool); + av_buffer_pool_uninit(&ev->gathered_pool); av_buffer_pool_uninit(&ev->compacted_pool); av_buffer_pool_uninit(&ev->sizes_pool); diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index a3c67e7f43..7837dd4a17 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -91,7 +91,10 @@ typedef struct VulkanEncodeFFv1Context { /* Output data buffer */ AVBufferPool *out_data_pool; - /* Gathered (contiguous) output buffer pool */ + /* Gathered (contiguous) device-local buffer pool */ + AVBufferPool *gathered_data_pool; + + /* Host-visible readback buffer pool */ AVBufferPool *compacted_data_pool; /* Intermediate frame pool */ @@ -301,6 +304,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, /* Output data */ size_t maxsize; FFVkBuffer *out_data_buf; + AVBufferRef *gathered_ref = NULL; + FFVkBuffer *gathered_buf = NULL; FFVkBuffer *compacted_buf; int has_inter = avctx->gop_size > 1; @@ -372,26 +377,40 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, maxsize = ffv1_vk_buffer_size(avctx); maxsize = FFMIN(maxsize, fv->s.props_11.maxMemoryAllocationSize); - /* Sparse per-slice output: written by encode, read by gather, never by the - * CPU, so device-local unless it won't fit in VRAM. */ - VkMemoryPropertyFlagBits out_buf_flags; - if (maxsize < fv->max_heap_size) - out_buf_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - else - out_buf_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - fv->s.host_cached_flag; + /* The sparse per-slice slots and the gathered copy both stay in VRAM, + * and the bitstream is read back with a single copy-engine transfer. + * Shaders only touch host memory if VRAM cannot hold both buffers: + * shader stores over the bus are unreliably slow on some drivers. */ + int host_only = 2*maxsize > fv->max_heap_size; RET(ff_vk_get_pooled_buffer(&fv->s, &fv->out_data_pool, &fd->out_data_ref, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, - NULL, maxsize, out_buf_flags)); + NULL, maxsize, + host_only ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + fv->s.host_cached_flag : + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; - /* Contiguous gathered output, read back by the CPU. */ + /* Device-local gather destination */ + if (!host_only) { + RET(ff_vk_get_pooled_buffer(&fv->s, &fv->gathered_data_pool, + &gathered_ref, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, maxsize, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)); + gathered_buf = (FFVkBuffer *)gathered_ref->data; + } + + /* Contiguous output, read back by the CPU. Gathered into directly + * when the gather destination cannot live in VRAM. */ RET(ff_vk_get_pooled_buffer(&fv->s, &fv->compacted_data_pool, &fd->compacted_data_ref, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, NULL, maxsize, @@ -465,6 +484,10 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, ff_vk_exec_add_dep_buf(&fv->s, exec, &slice_data_ref, 1, has_inter); ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1); ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->compacted_data_ref, 1, 1); + if (gathered_ref) { + ff_vk_exec_add_dep_buf(&fv->s, exec, &gathered_ref, 1, 0); + gathered_ref = NULL; /* Ownership passed */ + } if (f->remap_mode) { ff_vk_exec_add_dep_buf(&fv->s, exec, &remap_data_ref, 1, 0); remap_data_ref = NULL; @@ -699,8 +722,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, 0, sizeof(FFv1ShaderParams), &pd); vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); - /* Gather the per-slice slots into one contiguous host-visible buffer, - * in the same submission (no separate transfer pass). */ + /* Gather the per-slice slots into one contiguous buffer, in the same + * submission. */ FFVkBuffer *results_buf = &fv->results_buf; ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf, COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, @@ -719,7 +742,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, SegGatherPushData gather_pd = { .sparse = out_data_buf->address, - .compacted = compacted_buf->address, + .compacted = gathered_buf ? gathered_buf->address : + compacted_buf->address, .slot_size = (uint32_t)((out_data_buf->size / f->slice_count) & ~(size_t)15), }; ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->gather, 0, 0, 0, @@ -733,6 +757,25 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, 0, sizeof(gather_pd), &gather_pd); vk->CmdDispatch(exec->buf, f->slice_count, 1, 1); + /* Read the gathered bitstream back with the copy engine. The size is + * only known once the encode is done, so the whole buffer is copied; + * with the version-4 slot sizing this is close to the payload size. */ + if (gathered_buf) { + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], gathered_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + TRANSFER_BIT, TRANSFER_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + vk->CmdCopyBuffer(exec->buf, gathered_buf->buf, compacted_buf->buf, + 1, &(VkBufferCopy) { .size = maxsize }); + } + /* Submit */ err = ff_vk_exec_submit(&fv->s, exec); if (err < 0) @@ -747,6 +790,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, fail: av_frame_free(&tmp); + av_buffer_unref(&gathered_ref); ff_vk_exec_discard_deps(&fv->s, exec); return err; @@ -1290,7 +1334,7 @@ fail: static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) { int err; - size_t maxsize, max_heap_size, max_host_size; + size_t maxsize, max_heap_size; VulkanEncodeFFv1Context *fv = avctx->priv_data; FFV1Context *f = &fv->ctx; @@ -1400,16 +1444,14 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) /* Try to measure VRAM size */ max_heap_size = 0; - max_host_size = 0; for (int i = 0; i < fv->s.mprops.memoryHeapCount; i++) { if (fv->s.mprops.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) - max_heap_size = FFMAX(fv->max_heap_size, - fv->s.mprops.memoryHeaps[i].size); - if (!(fv->s.mprops.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)) - max_host_size = FFMAX(max_host_size, + max_heap_size = FFMAX(max_heap_size, fv->s.mprops.memoryHeaps[i].size); } - fv->max_heap_size = max_heap_size; + + /* Keep 1/8th of VRAM as headroom */ + fv->max_heap_size = max_heap_size - (max_heap_size >> 3); maxsize = ffv1_vk_buffer_size(avctx); if (maxsize > fv->s.props_11.maxMemoryAllocationSize) { @@ -1419,21 +1461,15 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) maxsize = fv->s.props_11.maxMemoryAllocationSize; } - if (max_heap_size < maxsize) { - av_log(avctx, AV_LOG_WARNING, "Encoding buffer (%zu) larger than VRAM (%zu), " + /* Sparse output slots + gathered copy */ + if (2*maxsize > fv->max_heap_size) + av_log(avctx, AV_LOG_WARNING, "Encoding buffers (2x%zu) larger than VRAM (%zu), " "using host memory (slower)\n", maxsize, fv->max_heap_size); - /* Keep 1/2th of RAM as headroom */ - max_heap_size = max_host_size - (max_host_size >> 1); - } else { - /* Keep 1/8th of VRAM as headroom */ - max_heap_size = max_heap_size - (max_heap_size >> 3); - } - av_log(avctx, AV_LOG_INFO, "Async buffers: %zuMiB per context, %zuMiB total, depth: %i\n", - maxsize / (1024*1024), - (fv->async_depth * maxsize) / (1024*1024), + 2*maxsize / (1024*1024), + (fv->async_depth * 2*maxsize) / (1024*1024), fv->async_depth); err = ff_vk_exec_pool_init(&fv->s, fv->qf, &fv->exec_pool, @@ -1602,6 +1638,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) av_buffer_unref(&fv->intermediate_frames_ref); av_buffer_pool_uninit(&fv->out_data_pool); + av_buffer_pool_uninit(&fv->gathered_data_pool); av_buffer_pool_uninit(&fv->compacted_data_pool); av_buffer_unref(&fv->keyframe_slice_data_ref); diff --git a/libavcodec/vulkan/seg_gather.comp.glsl b/libavcodec/vulkan/seg_gather.comp.glsl index 772f0200ff..2fc92630b2 100644 --- a/libavcodec/vulkan/seg_gather.comp.glsl +++ b/libavcodec/vulkan/seg_gather.comp.glsl @@ -31,8 +31,8 @@ * FFv1 slice, ...) into its own fixed-stride, device-local slot, because no * encoder workgroup knows the others' final sizes. Run afterwards, this * shader prefix-sums the per-segment sizes and packs the segments back to - * back into one contiguous, host-visible buffer -- so the device->host - * transfer is a single coalesced stream rather than a scattered dribble. + * back into one contiguous buffer -- so the device->host transfer is a + * single coalesced stream rather than a scattered dribble. * * One workgroup per segment. Inputs: the sparse slot buffer, the per-segment * sizes, and the slot stride. @@ -43,9 +43,9 @@ layout (set = 0, binding = 0, scalar) readonly buffer sizes_buf { }; layout (push_constant, scalar) uniform pushConstants { - u8buf sparse; /* device-local: one slot per segment */ - u8buf compacted; /* host-visible: contiguous output */ - uint slot_size; /* stride between sparse slots */ + u8buf sparse; /* one slot per segment */ + u8buf compacted; /* contiguous output */ + uint slot_size; /* stride between sparse slots */ }; shared uint s_dst_off; @@ -60,17 +60,24 @@ u32vec4 funnel(u32vec4 lo, u32vec4 hi, uint sh) if (sh == 0u) return lo; - uint s[8] = uint[8](lo.x, lo.y, lo.z, lo.w, hi.x, hi.y, hi.z, hi.w); uint uw = sh >> 2u; /* whole uints into the window */ uint bb = (sh & 3u) << 3u; /* remaining sub-uint shift, in bits */ + /* s[uw..uw+3] and s[uw+1..uw+4] of the concatenated (lo, hi) pair, + * selected via uniform branches; a dynamically indexed local array + * would get spilled to scratch memory. */ + u32vec4 a, c; + switch (uw) { + case 0u: a = lo; c = u32vec4(lo.yzw, hi.x); break; + case 1u: a = u32vec4(lo.yzw, hi.x); c = u32vec4(lo.zw, hi.xy); break; + case 2u: a = u32vec4(lo.zw, hi.xy); c = u32vec4(lo.w, hi.xyz); break; + default: a = u32vec4(lo.w, hi.xyz); c = hi; break; + } + if (bb == 0u) - return u32vec4(s[uw], s[uw + 1u], s[uw + 2u], s[uw + 3u]); + return a; - return u32vec4((s[uw ] >> bb) | (s[uw + 1u] << (32u - bb)), - (s[uw + 1u] >> bb) | (s[uw + 2u] << (32u - bb)), - (s[uw + 2u] >> bb) | (s[uw + 3u] << (32u - bb)), - (s[uw + 3u] >> bb) | (s[uw + 4u] << (32u - bb))); + return (a >> bb) | (c << (32u - bb)); } void main(void) @@ -82,14 +89,18 @@ void main(void) /* * Destination offset: the sum of all preceding segment sizes. The output * is packed tight -- segments back to back -- so it is usable directly as - * the assembled bitstream. + * the assembled bitstream. Reduced by the whole workgroup; a serial loop + * on one thread leaves 255 idle and is latency-bound on the reads. */ - if (b == 0u) { - uint o = 0u; - for (uint i = 0u; i < seg; i++) - o += seg_sizes[i]; - s_dst_off = o; - } + if (b == 0u) + s_dst_off = 0u; + barrier(); + + uint acc = 0u; + for (uint i = b; i < seg; i += wg) + acc += seg_sizes[i]; + if (acc != 0u) + atomicAdd(s_dst_off, acc); barrier(); const uint n = seg_sizes[seg]; -- 2.53.0