/*
    Copyright 2005-2013 Intel Corporation.  All Rights Reserved.

    This file is part of Threading Building Blocks.

    Threading Building Blocks is free software; you can redistribute it
    and/or modify it under the terms of the GNU General Public License
    version 2 as published by the Free Software Foundation.

    Threading Building Blocks is distributed in the hope that it will be
    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Threading Building Blocks; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    As a special exception, you may use this file as part of a free software
    library without restriction.  Specifically, if other files instantiate
    templates or use macros or inline functions from this file, or you compile
    this file and link it with other files to produce an executable, this
    file does not by itself cause the resulting executable to be covered by
    the GNU General Public License.  This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

#include "tbbmalloc_internal.h"

/********* Allocation of large objects ************/

namespace rml {
namespace internal {

#if __TBB_MALLOC_LOCACHE_STAT
intptr_t mallocCalls, cacheHits;
intptr_t memAllocKB, memHitKB;
#endif

inline bool lessThanWithOverflow(intptr_t a, intptr_t b)
{
    return (a < b && (b - a < UINTPTR_MAX/2)) ||
           (a > b && (a - b > UINTPTR_MAX/2));
}
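
/* A note on the comparison above: block ages come from a monotonically
   increasing counter that may eventually wrap around, so a plain a < b test
   is not sufficient.  Here a is treated as smaller only when it trails b by
   less than half of the counter range; a gap larger than half the range is
   interpreted as wrap-around.  For example, an age of UINTPTR_MAX-5 is
   considered less than an age of 10 (the counter has wrapped), while 10 is
   not considered less than UINTPTR_MAX-5. */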
template<typename Props> LargeMemoryBlock *LargeObjectCacheImpl<Props>::CacheBin::
    putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *head, BinBitMask *bitMask, int idx)
{
    int i, num, totalNum;
    size_t size = head->unalignedSize;
    LargeMemoryBlock *curr, *tail, *toRelease = NULL;
    uintptr_t currTime;

    // prev pointers were not maintained while the blocks were assigned to bins,
    // so restore them now
    head->prev = NULL;
    for (num=1, curr=head; curr->next; num++, curr=curr->next)
        curr->next->prev = curr;
    tail = curr;
    totalNum = num;

    {
        MallocMutex::scoped_lock scoped_cs(lock);
        usedSize -= num*size;
        // to keep the list ordered, get the time under the list lock
        currTime = extMemPool->loc.getCurrTimeRange(num);

        for (curr=tail, i=0; curr; curr=curr->prev, i++) {
            curr->age = currTime+i;
            STAT_increment(getThreadId(), ThreadCommonCounters, cacheLargeBlk);
        }

        if (!lastCleanedAge) {
            // The first object of this size was released.
            // Do not cache it, and remember when this happened,
            // to take it into account on a cache miss.
            lastCleanedAge = tail->age;
            toRelease = tail;
            tail = tail->prev;
            if (tail)
                tail->next = NULL;
            else
                head = NULL;
            num--;
        }
        if (num) {
            // add the [head;tail] list to the cache
            tail->next = first;
            if (first)
                first->prev = tail;
            first = head;
            if (!last) {
                MALLOC_ASSERT(0 == oldest, ASSERT_TEXT);
                oldest = tail->age;
                last = tail;
            }
            cachedSize += num*size;
        }
        /* It's acceptable for a bin to be empty while the bitmask reports it
           as non-empty, so the bit can be set to true without the lock.
           It's not acceptable for a bin to be non-empty while the bitmask
           reports it as empty, so the bit is set to false only under the lock. */
        // No used objects and nothing in the bin: mark the bin as empty
        if (!usedSize && !first)
            bitMask->set(idx, false);
    }
    extMemPool->loc.cleanupCacheIfNeededOnRange(&extMemPool->backend, totalNum, currTime);
    if (toRelease)
        toRelease->prev = toRelease->next = NULL;
    return toRelease;
}

template<typename Props> LargeMemoryBlock *LargeObjectCacheImpl<Props>::CacheBin::
    get(size_t size, uintptr_t currTime, bool *setNonEmpty)
{
    LargeMemoryBlock *result=NULL;
    {
        MallocMutex::scoped_lock scoped_cs(lock);
        forgetOutdatedState(currTime);

        if (first) {
            result = first;
            first = result->next;
            if (first)
                first->prev = NULL;
            else {
                last = NULL;
                oldest = 0;
            }
            // use a moving average with the current hit interval
            intptr_t hitR = currTime - result->age;
            lastHit = lastHit? (lastHit + hitR)/2 : hitR;

            cachedSize -= size;
        } else {
            if (lastCleanedAge)
                ageThreshold = Props::OnMissFactor*(currTime - lastCleanedAge);
        }
        if (!usedSize) // inform the caller that there are now used blocks in the bin
            *setNonEmpty = true;
        // subject to later correction, if we got a cache miss and the later allocation failed
        usedSize += size;
        lastGet = currTime;
    }
    return result;
}

// forget the bin's history if it has not been used for a long time
template<typename Props>
void LargeObjectCacheImpl<Props>::CacheBin::forgetOutdatedState(uintptr_t currTime)
{
    // If the time since the last get is LongWaitFactor times larger than the
    // bin's ageThreshold, treat the bin as rarely used and forget everything
    // we know about it.
    // If LongWaitFactor is too small, we forget too early, which hurts caching;
    // if it is too high, we keep caching blocks with an unrelated usage pattern.
    const uintptr_t sinceLastGet = currTime - lastGet;
    bool doCleanup = false;

    if (!last) { // clean only empty bins
        if (ageThreshold)
            doCleanup = sinceLastGet > Props::LongWaitFactor*ageThreshold;
        else if (lastCleanedAge)
            doCleanup = sinceLastGet > Props::LongWaitFactor*(lastCleanedAge - lastGet);
    }
    if (doCleanup) {
        lastCleanedAge = 0;
        ageThreshold = 0;
    }
}
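
/* A rough sketch of the per-bin bookkeeping implemented above:
   - putList() stamps each cached block with an increasing age and links the
     blocks at the head of the list, so `first` is the newest block and `last`
     the oldest.
   - get() on a hit maintains lastHit as a moving average of hit intervals;
     on a miss it sets ageThreshold to OnMissFactor*(currTime - lastCleanedAge),
     i.e. roughly how long a block would have had to stay cached to satisfy
     this request.
   - cleanToThreshold() below releases blocks whose age lags currTime by more
     than ageThreshold, while forgetOutdatedState() resets the statistics of an
     empty bin that has been idle for LongWaitFactor times its threshold. */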
template<typename Props> bool LargeObjectCacheImpl<Props>::CacheBin::
    cleanToThreshold(Backend *backend, BinBitMask *bitMask, uintptr_t currTime, int idx)
{
    LargeMemoryBlock *toRelease = NULL;
    bool released = false;
#if MALLOC_DEBUG
    uintptr_t nextAge = 0;
#endif

    /* oldest may be more recent than age; that's why a cast to a signed type
       is used.  Overflow of age is also handled correctly. */
    if (last && (intptr_t)(currTime - oldest) > ageThreshold) {
        MallocMutex::scoped_lock scoped_cs(lock);
        // double check
        if (last && (intptr_t)(currTime - last->age) > ageThreshold) {
            do {
#if MALLOC_DEBUG
                // check that the list is ordered
                MALLOC_ASSERT(!nextAge || lessThanWithOverflow(nextAge, last->age), ASSERT_TEXT);
                nextAge = last->age;
#endif
                cachedSize -= last->unalignedSize;
                last = last->prev;
            } while (last && (intptr_t)(currTime - last->age) > ageThreshold);

            if (last) {
                toRelease = last->next;
                oldest = last->age;
                last->next = NULL;
            } else {
                toRelease = first;
                first = NULL;
                oldest = 0;
                if (!usedSize)
                    bitMask->set(idx, false);
            }
            MALLOC_ASSERT( toRelease, ASSERT_TEXT );
            lastCleanedAge = toRelease->age;
        }
        else
            return false;
    }
    released = toRelease;

    while ( toRelease ) {
        LargeMemoryBlock *helper = toRelease->next;
        backend->returnLargeObject(toRelease);
        toRelease = helper;
    }
    return released;
}

template<typename Props> bool LargeObjectCacheImpl<Props>::
    CacheBin::cleanAll(Backend *backend, BinBitMask *bitMask, int idx)
{
    LargeMemoryBlock *toRelease = NULL;
    bool released = false;

    if (last) {
        MallocMutex::scoped_lock scoped_cs(lock);
        // double check
        if (last) {
            toRelease = first;
            last = NULL;
            first = NULL;
            oldest = 0;
            cachedSize = 0;
            if (!usedSize)
                bitMask->set(idx, false);
        }
        else
            return false;
    }
    released = toRelease;

    while ( toRelease ) {
        LargeMemoryBlock *helper = toRelease->next;
        MALLOC_ASSERT(!helper || lessThanWithOverflow(helper->age, toRelease->age), ASSERT_TEXT);
        backend->returnLargeObject(toRelease);
        toRelease = helper;
    }
    return released;
}

template<typename Props>
size_t LargeObjectCacheImpl<Props>::CacheBin::reportStat(int num, FILE *f)
{
#if __TBB_MALLOC_LOCACHE_STAT
    if (first)
        printf("%d(%lu): total %lu KB thr %ld lastCln %lu lastHit %lu oldest %lu\n",
               num, num*CacheStep+MinSize, cachedSize/1024,
               ageThreshold, lastCleanedAge, lastHit, oldest);
#else
    suppress_unused_warning(num);
    suppress_unused_warning(f);
#endif
    return cachedSize;
}

// release blocks from the cache that are older than ageThreshold
template<typename Props>
bool LargeObjectCacheImpl<Props>::regularCleanup(Backend *backend, uintptr_t currTime)
{
    bool released = false, doThreshDecr = false;
    BinsSummary binsSummary;

    for (int i = bitMask.getMaxTrue(numBins-1); i >= 0;
         i = bitMask.getMaxTrue(i-1)) {
        bin[i].updateBinsSummary(&binsSummary);
        if (!doThreshDecr && tooLargeLOC>2 && binsSummary.isLOCTooLarge()) {
            // If the LOC has been too large for quite a long time, decrease
            // the thresholds based on bin hit statistics.
            // For that, redo the cleanup from the beginning.
            // Note: on this iteration the total usedSz may not look too large
            // compared to the total cachedSz, as it was calculated only
            // partially.  That is acceptable here.
            i = bitMask.getMaxTrue(numBins-1);
            doThreshDecr = true;
            binsSummary.reset();
            continue;
        }
        if (doThreshDecr)
            bin[i].decreaseThreshold();
        if (bin[i].cleanToThreshold(backend, &bitMask, currTime, i))
            released = true;
    }

    // We want to detect whether the LOC stayed too large continuously for some
    // time, so races between incrementing and zeroing are fine, but the
    // increment itself must be atomic.
    if (binsSummary.isLOCTooLarge())
        AtomicIncrement(tooLargeLOC);
    else
        tooLargeLOC = 0;
    return released;
}

template<typename Props>
bool LargeObjectCacheImpl<Props>::cleanAll(Backend *backend)
{
    bool released = false;
    for (int i = numBins-1; i >= 0; i--)
        released |= bin[i].cleanAll(backend, &bitMask, i);
    return released;
}

#if __TBB_MALLOC_WHITEBOX_TEST
template<typename Props>
size_t LargeObjectCacheImpl<Props>::getLOCSize() const
{
    size_t size = 0;
    for (int i = numBins-1; i >= 0; i--)
        size += bin[i].getSize();
    return size;
}

size_t LargeObjectCache::getLOCSize() const
{
    return largeCache.getLOCSize() + hugeCache.getLOCSize();
}

template<typename Props>
size_t LargeObjectCacheImpl<Props>::getUsedSize() const
{
    size_t size = 0;
    for (int i = numBins-1; i >= 0; i--)
        size += bin[i].getUsedSize();
    return size;
}

size_t LargeObjectCache::getUsedSize() const
{
    return largeCache.getUsedSize() + hugeCache.getUsedSize();
}
#endif // __TBB_MALLOC_WHITEBOX_TEST

uintptr_t LargeObjectCache::getCurrTime()
{
    return (uintptr_t)AtomicIncrement((intptr_t&)cacheCurrTime);
}

uintptr_t LargeObjectCache::getCurrTimeRange(uintptr_t range)
{
    return (uintptr_t)AtomicAdd((intptr_t&)cacheCurrTime, range)+1;
}

void LargeObjectCache::cleanupCacheIfNeeded(Backend *backend, uintptr_t currTime)
{
    if ( 0 == currTime % cacheCleanupFreq )
        doRegularCleanup(backend, currTime);
}

void LargeObjectCache::
    cleanupCacheIfNeededOnRange(Backend *backend, uintptr_t range, uintptr_t currTime)
{
    if (range >= cacheCleanupFreq
        || currTime+range < currTime-1 // overflow, 0 is a power of 2, do cleanup
        // (prev; prev+range] contains n*cacheCleanupFreq
        || alignUp(currTime, cacheCleanupFreq) <= currTime+range)
        doRegularCleanup(backend, currTime);
}
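
/* An illustration of the last clause above, assuming cacheCleanupFreq == 256:
   with currTime == 250 and range == 10, alignUp(250, 256) == 256 <= 260, so a
   multiple of cacheCleanupFreq falls within the freshly consumed time range
   and a cleanup is triggered; with range == 3, alignUp(250, 256) == 256 > 253
   and nothing happens.  The first two clauses cover the cases where range
   alone spans a whole period and where currTime+range wraps around zero
   (zero itself being a multiple of cacheCleanupFreq). */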
bool LargeObjectCache::doRegularCleanup(Backend *backend, uintptr_t currTime)
{
    return largeCache.regularCleanup(backend, currTime)
        | hugeCache.regularCleanup(backend, currTime);
}

bool LargeObjectCache::cleanAll(Backend *backend)
{
    return largeCache.cleanAll(backend) | hugeCache.cleanAll(backend);
}

template<typename Props>
LargeMemoryBlock *LargeObjectCacheImpl<Props>::get(uintptr_t currTime, size_t size)
{
    MALLOC_ASSERT( size%Props::CacheStep==0, ASSERT_TEXT );
    int idx = sizeToIdx(size);
    bool setNonEmpty = false;

    LargeMemoryBlock *lmb = bin[idx].get(size, currTime, &setNonEmpty);
    // Setting the bit to true is possible outside the lock.  As the bitmask is
    // used only for cleanup, the lack of consistency does not violate correctness here.
    if (setNonEmpty)
        bitMask.set(idx, true);
    if (lmb) {
        MALLOC_ITT_SYNC_ACQUIRED(bin+idx);
        STAT_increment(getThreadId(), ThreadCommonCounters, allocCachedLargeBlk);
    }
    return lmb;
}

template<typename Props>
void LargeObjectCacheImpl<Props>::rollbackCacheState(size_t size)
{
    int idx = sizeToIdx(size);
    MALLOC_ASSERT(idx<numBins, ASSERT_TEXT);
    bin[idx].decrUsedSize(size, &bitMask, idx);
}

template<typename Props>
void LargeObjectCacheImpl<Props>::reportStat(FILE *f)
{
    size_t cachedSize = 0;
    for (int i=0; i<numBins; i++)
        cachedSize += bin[i].reportStat(i, f);
    fprintf(f, "total LOC size %lu MB\n", cachedSize/1024/1024);
}

template<typename Props>
void LargeObjectCacheImpl<Props>::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *toCache)
{
    int toBinIdx = sizeToIdx(toCache->unalignedSize);

    MALLOC_ITT_SYNC_RELEASING(bin+toBinIdx);
    if (LargeMemoryBlock *release = bin[toBinIdx].putList(extMemPool, toCache,
                                                          &bitMask, toBinIdx))
        extMemPool->backend.returnLargeObject(release);
}

void LargeObjectCache::rollbackCacheState(size_t size)
{
    if (size < maxLargeSize)
        largeCache.rollbackCacheState(size);
    else if (size < maxHugeSize)
        hugeCache.rollbackCacheState(size);
}

// Return an artificial bin index; it is used only during sorting and never stored.
int LargeObjectCache::sizeToIdx(size_t size)
{
    MALLOC_ASSERT(size < maxHugeSize, ASSERT_TEXT);
    return size < maxLargeSize?
        LargeCacheType::sizeToIdx(size) :
        LargeCacheType::getNumBins()+HugeCacheType::sizeToIdx(size);
}

void LargeObjectCache::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *list)
{
    LargeMemoryBlock *toProcess, *n;

    for (LargeMemoryBlock *curr = list; curr; curr = toProcess) {
        LargeMemoryBlock *tail = curr;
        toProcess = curr->next;
        if (curr->unalignedSize >= maxHugeSize) {
            extMemPool->backend.returnLargeObject(curr);
            continue;
        }
        int currIdx = sizeToIdx(curr->unalignedSize);

        // Find all blocks that fit the same bin.  A more efficient sorting
        // algorithm is not used because the list is short (commonly,
        // LocalLOC's HIGH_MARK-LOW_MARK, i.e. 24 items).
        for (LargeMemoryBlock *b = toProcess; b; b = n) {
            n = b->next;
            if (sizeToIdx(b->unalignedSize) == currIdx) {
                tail->next = b;
                tail = b;
                if (toProcess == b)
                    toProcess = toProcess->next;
                else {
                    b->prev->next = b->next;
                    if (b->next)
                        b->next->prev = b->prev;
                }
            }
        }
        tail->next = NULL;
        if (curr->unalignedSize < maxLargeSize)
            largeCache.putList(extMemPool, curr);
        else
            hugeCache.putList(extMemPool, curr);
    }
}
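
/* For illustration: if the freed list is A1 -> B -> A2, where A1 and A2 have
   the same size and B a different one (all below maxLargeSize), the loop above
   first gathers A1 and A2 into one chain and passes it to largeCache.putList()
   in a single call, then passes B on its own in the next iteration.  Grouping
   by bin index keeps the number of per-bin lock acquisitions proportional to
   the number of distinct sizes rather than to the number of blocks. */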
void LargeObjectCache::put(ExtMemoryPool *extMemPool, LargeMemoryBlock *largeBlock)
{
    if (largeBlock->unalignedSize < maxHugeSize) {
        largeBlock->next = NULL;
        if (largeBlock->unalignedSize < maxLargeSize)
            largeCache.putList(extMemPool, largeBlock);
        else
            hugeCache.putList(extMemPool, largeBlock);
    } else
        extMemPool->backend.returnLargeObject(largeBlock);
}

LargeMemoryBlock *LargeObjectCache::get(Backend *backend, size_t size)
{
    MALLOC_ASSERT( size%largeBlockCacheStep==0, ASSERT_TEXT );
    MALLOC_ASSERT( size>=minLargeSize, ASSERT_TEXT );

    if ( size < maxHugeSize) {
        uintptr_t currTime = getCurrTime();
        cleanupCacheIfNeeded(backend, currTime);
        return size < maxLargeSize?
            largeCache.get(currTime, size) : hugeCache.get(currTime, size);
    }
    return NULL;
}

LargeMemoryBlock *ExtMemoryPool::mallocLargeObject(size_t allocationSize)
{
#if __TBB_MALLOC_LOCACHE_STAT
    AtomicIncrement(mallocCalls);
    AtomicAdd(memAllocKB, allocationSize/1024);
#endif
    LargeMemoryBlock* lmb = loc.get(&backend, allocationSize);
    if (!lmb) {
        BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/true);
        if (backRefIdx.isInvalid())
            return NULL;

        // unalignedSize is set in getLargeBlock
        lmb = backend.getLargeBlock(allocationSize);
        if (!lmb) {
            removeBackRef(backRefIdx);
            loc.rollbackCacheState(allocationSize);
            return NULL;
        }
        lmb->backRefIdx = backRefIdx;
        STAT_increment(getThreadId(), ThreadCommonCounters, allocNewLargeObj);
    } else {
#if __TBB_MALLOC_LOCACHE_STAT
        AtomicIncrement(cacheHits);
        AtomicAdd(memHitKB, allocationSize/1024);
#endif
    }
    return lmb;
}

void ExtMemoryPool::freeLargeObject(LargeMemoryBlock *mBlock)
{
    loc.put(this, mBlock);
}

void ExtMemoryPool::freeLargeObjectList(LargeMemoryBlock *head)
{
    loc.putList(this, head);
}

bool ExtMemoryPool::softCachesCleanup()
{
    // TODO: cleanup small objects as well
    return loc.regularCleanup(&backend);
}

bool ExtMemoryPool::hardCachesCleanup()
{
    // Thread-local caches must be cleaned before the LOC,
    // because objects from thread-local caches can be released to the LOC.
    bool tlCaches = releaseTLCaches(), locCaches = loc.cleanAll(&backend);
    return tlCaches || locCaches;
}

/*********** End allocation of large objects **********/

} // namespace internal
} // namespace rml