/*
Copyright 2005-2013 Intel Corporation. All Rights Reserved.

This file is part of Threading Building Blocks.

Threading Building Blocks is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.

Threading Building Blocks is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Threading Building Blocks; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

As a special exception, you may use this file as part of a free software
library without restriction. Specifically, if other files instantiate
templates or use macros or inline functions from this file, or you compile
this file and link it with other files to produce an executable, this
file does not by itself cause the resulting executable to be covered by
the GNU General Public License. This exception does not however
invalidate any other reasons why the executable file might be covered by
the GNU General Public License.
*/
#include "tbbmalloc_internal.h"
/********* Allocation of large objects ************/
namespace rml {
namespace internal {
#if __TBB_MALLOC_LOCACHE_STAT
intptr_t mallocCalls, cacheHits;
intptr_t memAllocKB, memHitKB;
#endif
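// Overflow-tolerant comparison of block ages: a is treated as older than b
// when the wrapped distance from a to b is less than half the counter range.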
inline bool lessThanWithOverflow(intptr_t a, intptr_t b)
{
return (a < b && (b - a < UINTPTR_MAX/2)) ||
(a > b && (a - b > UINTPTR_MAX/2));
}
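// Put a list of same-sized blocks into the bin. Timestamps are assigned under
// the bin lock so the cached list stays ordered by age. If the bin has no
// cleanup history yet (lastCleanedAge == 0), one block is handed back to the
// caller for release to the backend instead of being cached.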
template<typename Props>
LargeMemoryBlock *LargeObjectCacheImpl<Props>::CacheBin::
putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *head, BinBitMask *bitMask, int idx)
{
int i, num, totalNum;
size_t size = head->unalignedSize;
LargeMemoryBlock *curr, *tail, *toRelease = NULL;
uintptr_t currTime;
// prev pointers were not kept while assigning blocks to bins, so set them now
head->prev = NULL;
for (num=1, curr=head; curr->next; num++, curr=curr->next)
curr->next->prev = curr;
tail = curr;
totalNum = num;
{
MallocMutex::scoped_lock scoped_cs(lock);
usedSize -= num*size;
// to keep ordering on the list, get the time under the list lock
currTime = extMemPool->loc.getCurrTimeRange(num);
for (curr=tail, i=0; curr; curr=curr->prev, i++) {
curr->age = currTime+i;
STAT_increment(getThreadId(), ThreadCommonCounters, cacheLargeBlk);
}
if (!lastCleanedAge) {
// The 1st object of this size was released.
// Do not cache it, and remember when this occurred,
// to take it into account on a cache miss.
lastCleanedAge = tail->age;
toRelease = tail;
tail = tail->prev;
if (tail)
tail->next = NULL;
else
head = NULL;
num--;
}
if (num) {
// add [head;tail] list to cache
tail->next = first;
if (first)
first->prev = tail;
first = head;
if (!last) {
MALLOC_ASSERT(0 == oldest, ASSERT_TEXT);
oldest = tail->age;
last = tail;
}
cachedSize += num*size;
}
/* It's acceptable if a bin is empty while the bit mask says it's non-empty,
so set true in the bitmask without the lock.
It's not acceptable if a bin is non-empty while the bitmask says it's empty,
so set false in the bitmask under the lock. */
// No used objects and nothing in the bin: mark the bin as empty
if (!usedSize && !first)
bitMask->set(idx, false);
}
extMemPool->loc.cleanupCacheIfNeededOnRange(&extMemPool->backend, totalNum, currTime);
if (toRelease)
toRelease->prev = toRelease->next = NULL;
return toRelease;
}
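// Take the most recently cached block from the bin, if any. On a hit, the
// moving average of hit intervals is updated; on a miss, the age threshold is
// derived from the time since the last cleanup. usedSize is increased
// optimistically and corrected later if the backend allocation fails.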
template<typename Props>
LargeMemoryBlock *LargeObjectCacheImpl<Props>::CacheBin::
get(size_t size, uintptr_t currTime, bool *setNonEmpty)
{
LargeMemoryBlock *result=NULL;
{
MallocMutex::scoped_lock scoped_cs(lock);
forgetOutdatedState(currTime);
if (first) {
result = first;
first = result->next;
if (first)
first->prev = NULL;
else {
last = NULL;
oldest = 0;
}
// use moving average with current hit interval
intptr_t hitR = currTime - result->age;
lastHit = lastHit? (lastHit + hitR)/2 : hitR;
cachedSize -= size;
} else {
if (lastCleanedAge)
ageThreshold = Props::OnMissFactor*(currTime - lastCleanedAge);
}
if (!usedSize) // inform the caller that the bin now has used blocks
*setNonEmpty = true;
// subject to later correction, if we got a cache miss and the subsequent allocation failed
usedSize += size;
lastGet = currTime;
}
return result;
}
// forget the history of the bin if it was unused for a long time
template<typename Props>
void LargeObjectCacheImpl<Props>::CacheBin::forgetOutdatedState(uintptr_t currTime)
{
// If the time since the last get is LongWaitFactor times more than ageThreshold
// for the bin, treat the bin as rarely-used and forget everything we know
// about it.
// If LongWaitFactor is too small, we forget too early, which
// prevents good caching; if it is too high, blocks with unrelated
// usage patterns get cached together.
const uintptr_t sinceLastGet = currTime - lastGet;
bool doCleanup = false;
if (!last) { // clean only empty bins
if (ageThreshold)
doCleanup = sinceLastGet > Props::LongWaitFactor*ageThreshold;
else if (lastCleanedAge)
doCleanup = sinceLastGet > Props::LongWaitFactor*(lastCleanedAge - lastGet);
}
if (doCleanup) {
lastCleanedAge = 0;
ageThreshold = 0;
}
}
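// Walk the list from its oldest end and release blocks whose age exceeds
// ageThreshold; returns true if at least one block was given back to the backend.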
template<typename Props>
bool LargeObjectCacheImpl<Props>::CacheBin::
cleanToThreshold(Backend *backend, BinBitMask *bitMask, uintptr_t currTime, int idx)
{
LargeMemoryBlock *toRelease = NULL;
bool released = false;
#if MALLOC_DEBUG
uintptr_t nextAge = 0;
#endif
/* oldest may be more recent than currTime, that's why a cast to a signed type
is used. Age counter overflow is also handled correctly. */
if (last && (intptr_t)(currTime - oldest) > ageThreshold) {
MallocMutex::scoped_lock scoped_cs(lock);
// double check
if (last && (intptr_t)(currTime - last->age) > ageThreshold) {
do {
#if MALLOC_DEBUG
// check that the list is ordered
MALLOC_ASSERT(!nextAge || lessThanWithOverflow(nextAge, last->age),
ASSERT_TEXT);
nextAge = last->age;
#endif
cachedSize -= last->unalignedSize;
last = last->prev;
} while (last && (intptr_t)(currTime - last->age) > ageThreshold);
if (last) {
toRelease = last->next;
oldest = last->age;
last->next = NULL;
} else {
toRelease = first;
first = NULL;
oldest = 0;
if (!usedSize)
bitMask->set(idx, false);
}
MALLOC_ASSERT( toRelease, ASSERT_TEXT );
lastCleanedAge = toRelease->age;
}
else
return false;
}
released = toRelease;
while ( toRelease ) {
LargeMemoryBlock *helper = toRelease->next;
backend->returnLargeObject(toRelease);
toRelease = helper;
}
return released;
}
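// Unconditionally release every block cached in the bin back to the backend.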
template<typename Props>
bool LargeObjectCacheImpl<Props>::
CacheBin::cleanAll(Backend *backend, BinBitMask *bitMask, int idx)
{
LargeMemoryBlock *toRelease = NULL;
bool released = false;
if (last) {
MallocMutex::scoped_lock scoped_cs(lock);
// double check
if (last) {
toRelease = first;
last = NULL;
first = NULL;
oldest = 0;
cachedSize = 0;
if (!usedSize)
bitMask->set(idx, false);
}
else
return false;
}
released = toRelease;
while ( toRelease ) {
LargeMemoryBlock *helper = toRelease->next;
MALLOC_ASSERT(!helper || lessThanWithOverflow(helper->age, toRelease->age),
ASSERT_TEXT);
backend->returnLargeObject(toRelease);
toRelease = helper;
}
return released;
}
template<typename Props>
size_t LargeObjectCacheImpl<Props>::CacheBin::reportStat(int num, FILE *f)
{
#if __TBB_MALLOC_LOCACHE_STAT
if (first)
fprintf(f, "%d(%lu): total %lu KB thr %ld lastCln %lu lastHit %lu oldest %lu\n",
num, num*CacheStep+MinSize,
cachedSize/1024, ageThreshold, lastCleanedAge, lastHit, oldest);
#else
suppress_unused_warning(num);
suppress_unused_warning(f);
#endif
return cachedSize;
}
// release blocks from the cache that are older than ageThreshold
template<typename Props>
bool LargeObjectCacheImpl<Props>::regularCleanup(Backend *backend, uintptr_t currTime)
{
bool released = false, doThreshDecr = false;
BinsSummary binsSummary;
for (int i = bitMask.getMaxTrue(numBins-1); i >= 0;
i = bitMask.getMaxTrue(i-1)) {
bin[i].updateBinsSummary(&binsSummary);
if (!doThreshDecr && tooLargeLOC>2 && binsSummary.isLOCTooLarge()) {
// if the LOC has been too large for quite a long time, decrease the threshold
// based on bin hit statistics.
// For this, redo the cleanup from the beginning.
// Note: on this iteration the total usedSz may not be very large
// in comparison to the total cachedSz, as we calculated it only
// partially. We are OK with this.
i = bitMask.getMaxTrue(numBins-1);
doThreshDecr = true;
binsSummary.reset();
continue;
}
if (doThreshDecr)
bin[i].decreaseThreshold();
if (bin[i].cleanToThreshold(backend, &bitMask, currTime, i))
released = true;
}
// We want to detect whether the LOC has been too large continuously for some time,
// so races between incrementing and zeroing are OK, but incrementing
// must be atomic.
if (binsSummary.isLOCTooLarge())
AtomicIncrement(tooLargeLOC);
else
tooLargeLOC = 0;
return released;
}
template<typename Props>
bool LargeObjectCacheImpl<Props>::cleanAll(Backend *backend)
{
bool released = false;
for (int i = numBins-1; i >= 0; i--)
released |= bin[i].cleanAll(backend, &bitMask, i);
return released;
}
#if __TBB_MALLOC_WHITEBOX_TEST
template<typename Props>
size_t LargeObjectCacheImpl<Props>::getLOCSize() const
{
size_t size = 0;
for (int i = numBins-1; i >= 0; i--)
size += bin[i].getSize();
return size;
}
size_t LargeObjectCache::getLOCSize() const
{
return largeCache.getLOCSize() + hugeCache.getLOCSize();
}
template<typename Props>
size_t LargeObjectCacheImpl<Props>::getUsedSize() const
{
size_t size = 0;
for (int i = numBins-1; i >= 0; i--)
size += bin[i].getUsedSize();
return size;
}
size_t LargeObjectCache::getUsedSize() const
{
return largeCache.getUsedSize() + hugeCache.getUsedSize();
}
#endif // __TBB_MALLOC_WHITEBOX_TEST
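// Cache "time" is a global atomic counter; it only orders events and ages
// cached blocks, it is not wall-clock time.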
uintptr_t LargeObjectCache::getCurrTime()
{
return (uintptr_t)AtomicIncrement((intptr_t&)cacheCurrTime);
}
uintptr_t LargeObjectCache::getCurrTimeRange(uintptr_t range)
{
return (uintptr_t)AtomicAdd((intptr_t&)cacheCurrTime, range)+1;
}
void LargeObjectCache::cleanupCacheIfNeeded(Backend *backend, uintptr_t currTime)
{
if ( 0 == currTime % cacheCleanupFreq )
doRegularCleanup(backend, currTime);
}
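// Run a regular cleanup if the just-consumed time range covers a multiple of
// cacheCleanupFreq (counter overflow also triggers a cleanup).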
void LargeObjectCache::
cleanupCacheIfNeededOnRange(Backend *backend, uintptr_t range, uintptr_t currTime)
{
if (range >= cacheCleanupFreq
|| currTime+range < currTime-1 // overflow: the range crosses 0, which is a multiple of cacheCleanupFreq, so do cleanup
// (prev;prev+range] contains n*cacheCleanupFreq
|| alignUp(currTime, cacheCleanupFreq)<=currTime+range)
doRegularCleanup(backend, currTime);
}
bool LargeObjectCache::doRegularCleanup(Backend *backend, uintptr_t currTime)
{
return largeCache.regularCleanup(backend, currTime)
| hugeCache.regularCleanup(backend, currTime);
}
bool LargeObjectCache::cleanAll(Backend *backend)
{
return largeCache.cleanAll(backend) | hugeCache.cleanAll(backend);
}
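// Look up a cached block of exactly the requested (stepped) size; the
// corresponding bit in the cleanup bitmask is set once the bin gains used blocks.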
template<typename Props>
LargeMemoryBlock *LargeObjectCacheImpl<Props>::get(uintptr_t currTime, size_t size)
{
MALLOC_ASSERT( size%Props::CacheStep==0, ASSERT_TEXT );
int idx = sizeToIdx(size);
bool setNonEmpty = false;
LargeMemoryBlock *lmb = bin[idx].get(size, currTime, &setNonEmpty);
// Setting to true is possible outside the lock. As the bitmask is used only for cleanup,
// the lack of consistency does not violate correctness here.
if (setNonEmpty)
bitMask.set(idx, true);
if (lmb) {
MALLOC_ITT_SYNC_ACQUIRED(bin+idx);
STAT_increment(getThreadId(), ThreadCommonCounters, allocCachedLargeBlk);
}
return lmb;
}
template<typename Props>
void LargeObjectCacheImpl<Props>::rollbackCacheState(size_t size)
{
int idx = sizeToIdx(size);
MALLOC_ASSERT(idx<numBins, ASSERT_TEXT);
bin[idx].decrUsedSize(size, &bitMask, idx);
}
#if __TBB_MALLOC_LOCACHE_STAT
template<typename Props>
void LargeObjectCacheImpl<Props>::reportStat(FILE *f)
{
size_t cachedSize = 0;
for (int i=0; i<numLargeBlockBins; i++)
cachedSize += bin[i].reportStat(i, f);
fprintf(f, "total LOC size %lu MB\nnow %lu\n", cachedSize/1024/1024,
loCacheStat.age);
}
void LargeObjectCache::reportStat(FILE *f)
{
largeObjs.reportStat(f);
hugeObjs.reportStat(f);
}
#endif
template<typename Props>
void LargeObjectCacheImpl<Props>::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *toCache)
{
int toBinIdx = sizeToIdx(toCache->unalignedSize);
MALLOC_ITT_SYNC_RELEASING(bin+toBinIdx);
if (LargeMemoryBlock *release = bin[toBinIdx].putList(extMemPool, toCache,
&bitMask, toBinIdx))
extMemPool->backend.returnLargeObject(release);
}
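// Undo the optimistic usedSize accounting done by get() when the subsequent
// backend allocation failed.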
void LargeObjectCache::rollbackCacheState(size_t size)
{
if (size < maxLargeSize)
largeCache.rollbackCacheState(size);
else if (size < maxHugeSize)
hugeCache.rollbackCacheState(size);
}
// return an artificial bin index; it's used only during sorting and never saved
int LargeObjectCache::sizeToIdx(size_t size)
{
MALLOC_ASSERT(size < maxHugeSize, ASSERT_TEXT);
return size < maxLargeSize?
LargeCacheType::sizeToIdx(size) :
LargeCacheType::getNumBins()+HugeCacheType::sizeToIdx(size);
}
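// Return a list of blocks of possibly different sizes to the cache: blocks are
// grouped by destination bin with a simple selection pass; blocks of
// maxHugeSize and above bypass the cache and go straight to the backend.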
void LargeObjectCache::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *list)
{
LargeMemoryBlock *toProcess, *n;
for (LargeMemoryBlock *curr = list; curr; curr = toProcess) {
LargeMemoryBlock *tail = curr;
toProcess = curr->next;
if (curr->unalignedSize >= maxHugeSize) {
extMemPool->backend.returnLargeObject(curr);
continue;
}
int currIdx = sizeToIdx(curr->unalignedSize);
// Find all blocks that fit the same bin. A more efficient sorting
// algorithm is not used because the list is short (commonly
// LocalLOC's HIGH_MARK-LOW_MARK, i.e. 24 items).
for (LargeMemoryBlock *b = toProcess; b; b = n) {
n = b->next;
if (sizeToIdx(b->unalignedSize) == currIdx) {
tail->next = b;
tail = b;
if (toProcess == b)
toProcess = toProcess->next;
else {
b->prev->next = b->next;
if (b->next)
b->next->prev = b->prev;
}
}
}
tail->next = NULL;
if (curr->unalignedSize < maxLargeSize)
largeCache.putList(extMemPool, curr);
else
hugeCache.putList(extMemPool, curr);
}
}
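// Return a single block to the cache; blocks of maxHugeSize and above bypass
// the cache and are released to the backend directly.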
void LargeObjectCache::put(ExtMemoryPool *extMemPool, LargeMemoryBlock *largeBlock)
{
if (largeBlock->unalignedSize < maxHugeSize) {
largeBlock->next = NULL;
if (largeBlock->unalignedSize<maxLargeSize)
largeCache.putList(extMemPool, largeBlock);
else
hugeCache.putList(extMemPool, largeBlock);
} else
extMemPool->backend.returnLargeObject(largeBlock);
}
LargeMemoryBlock *LargeObjectCache::get(Backend *backend, size_t size)
{
MALLOC_ASSERT( size%largeBlockCacheStep==0, ASSERT_TEXT );
MALLOC_ASSERT( size>=minLargeSize, ASSERT_TEXT );
if ( size < maxHugeSize) {
uintptr_t currTime = getCurrTime();
cleanupCacheIfNeeded(backend, currTime);
return size < maxLargeSize?
largeCache.get(currTime, size) : hugeCache.get(currTime, size);
}
return NULL;
}
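// Allocate a large object: try the large object cache first; on a miss, create
// a back reference and get a fresh block from the backend, rolling the cache
// accounting back if that fails.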
LargeMemoryBlock *ExtMemoryPool::mallocLargeObject(size_t allocationSize)
{
#if __TBB_MALLOC_LOCACHE_STAT
AtomicIncrement(mallocCalls);
AtomicAdd(memAllocKB, allocationSize/1024);
#endif
LargeMemoryBlock* lmb = loc.get(&backend, allocationSize);
if (!lmb) {
BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/true);
if (backRefIdx.isInvalid())
return NULL;
// unalignedSize is set in getLargeBlock
lmb = backend.getLargeBlock(allocationSize);
if (!lmb) {
removeBackRef(backRefIdx);
loc.rollbackCacheState(allocationSize);
return NULL;
}
lmb->backRefIdx = backRefIdx;
STAT_increment(getThreadId(), ThreadCommonCounters, allocNewLargeObj);
} else {
#if __TBB_MALLOC_LOCACHE_STAT
AtomicIncrement(cacheHits);
AtomicAdd(memHitKB, allocationSize/1024);
#endif
}
return lmb;
}
void ExtMemoryPool::freeLargeObject(LargeMemoryBlock *mBlock)
{
loc.put(this, mBlock);
}
void ExtMemoryPool::freeLargeObjectList(LargeMemoryBlock *head)
{
loc.putList(this, head);
}
bool ExtMemoryPool::softCachesCleanup()
{
// TODO: cleanup small objects as well
return loc.regularCleanup(&backend);
}
bool ExtMemoryPool::hardCachesCleanup()
{
// thread-local caches must be cleaned before the LOC,
// because an object from a thread-local cache can be released to the LOC
bool tlCaches = releaseTLCaches(), locCaches = loc.cleanAll(&backend);
return tlCaches || locCaches;
}
/*********** End allocation of large objects **********/
} // namespace internal
} // namespace rml