From 95974c0a8aac17009a806d251497f7f308a35fb2 Mon Sep 17 00:00:00 2001 From: Pritika Vipin <65793273+Pritiks23@users.noreply.github.com> Date: Thu, 21 May 2026 03:40:27 +0000 Subject: [PATCH] allocator: harden shadow pool error handling and diagnostics Signed-off-by: Pritika Vipin <65793273+Pritiks23@users.noreply.github.com> --- src/allocator.cc | 74 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/src/allocator.cc b/src/allocator.cc index 87cb843dd9..808da81672 100644 --- a/src/allocator.cc +++ b/src/allocator.cc @@ -11,8 +11,6 @@ #include "nvtx.h" #include "utils.h" -NCCL_PARAM(ShadowMempoolMaxSize, "SHADOW_MEMPOOL_MAX_SIZE", 1LL << 30); - NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); ncclResult_t ncclMemAlloc(void **ptr, size_t size) { NCCL_NVTX3_FUNC_RANGE; @@ -331,6 +329,7 @@ ncclResult_t ncclShadowPoolAlloc( struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj, cudaStream_t stream ) { + ncclResult_t ret = ncclSuccess; if (size == 0) { if (outDevObj) *outDevObj = nullptr; if (outHostObj) *outHostObj = nullptr; @@ -344,11 +343,22 @@ ncclResult_t ncclShadowPoolAlloc( props.handleTypes = cudaMemHandleTypeNone; props.location.type = cudaMemLocationTypeDevice; cudaGetDevice(&props.location.id); - props.maxSize = (size_t)ncclParamShadowMempoolMaxSize(); - CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props)); + // Failing steps first, operate on state on the stack + cudaMemPool_t memPool = nullptr; + CUDACHECK(cudaMemPoolCreate(&memPool, &props)); + + struct ncclShadowObject** table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<4); + if (table == nullptr) { + WARN("Failed to allocate hash table for shadow pool"); + cudaMemPoolDestroy(memPool); + return ncclSystemError; + } + + // Non-failing steps last - update pool state only after all operations succeed + pool->memPool = memPool; + pool->table = table; pool->hbits = hbits = 4; - pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<table[i] = nullptr; } @@ -356,6 +366,10 @@ ncclResult_t ncclShadowPoolAlloc( if (pool->count+1 > 2<table; struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1)); + if (table1 == nullptr) { + WARN("Failed to allocate expanded hash table for shadow pool"); + return ncclSystemError; + } pool->table = table1; pool->hbits = hbits+1; for (int i1=0; i1 < 2<= 3) { int shift = std::max(0, (int)log2Down(size) + 1 - 4); int pageObjSize = ((size + (1<>shift)<(64<<10, 64*pageObjSize); page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage)); + if (page == nullptr) { + WARN("Failed to allocate shadow pool page metadata"); + free(obj); + return ncclSystemError; + } page->objSize = pageObjSize; page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize); + page->next = nullptr; + page->devObjs = nullptr; + newPage = page; + CUDACHECKGOTO(cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(page->devObjs, 0, pageSize, stream), ret, fail); + newPage = nullptr; + // Only link page into pool after CUDA operations succeed page->next = pool->pages; pool->pages = page; - CUDACHECK(cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream)); - CUDACHECK(cudaMemsetAsync(page->devObjs, 0, pageSize, stream)); // fall through... } if (page->objSize == pageObjSize) { @@ -400,13 +434,9 @@ ncclResult_t ncclShadowPoolAlloc( } } else { page = nullptr; - CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream)); - CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream)); + CUDACHECKGOTO(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(devObj, 0, size, stream), ret, fail); } - - struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc( - sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size - ); obj->page = page; obj->devObj = devObj; obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t)); @@ -416,6 +446,16 @@ ncclResult_t ncclShadowPoolAlloc( if (outDevObj) *outDevObj = devObj; if (outHostObj) *outHostObj = obj->hostObj; return ncclSuccess; + +fail: + if (newPage != nullptr) { + if (newPage->devObjs != nullptr) cudaFreeAsync(newPage->devObjs, stream); + free(newPage); + } else if (page == nullptr && devObj != nullptr) { + cudaFreeAsync(devObj, stream); + } + free(obj); + return ret; } ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) { @@ -425,7 +465,9 @@ ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaS struct ncclShadowObject** pobj = &pool->table[b]; while (true) { if (*pobj == nullptr) { - WARN("Device object does not exist in shadow pool."); + WARN("ncclShadowPoolFree: Device object %p not found in shadow pool (hash bucket %lu). " + "This may indicate a use-after-free or double-free error. Pool has %d objects.", + devObj, b, pool->count); return ncclInternalError; } if ((*pobj)->devObj == devObj) break;