]>
Commit | Line | Data |
---|---|---|
5bcae85e SL |
1 | //===-- working_set.cpp ---------------------------------------------------===// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | // This file is a part of EfficiencySanitizer, a family of performance tuners. | |
11 | // | |
12 | // This file contains working-set-specific code. | |
13 | //===----------------------------------------------------------------------===// | |
14 | ||
15 | #include "working_set.h" | |
16 | #include "esan.h" | |
17 | #include "esan_circular_buffer.h" | |
18 | #include "esan_flags.h" | |
19 | #include "esan_shadow.h" | |
20 | #include "esan_sideline.h" | |
21 | #include "sanitizer_common/sanitizer_procmaps.h" | |
22 | ||
// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies, each bit to the left from the lowest bit stepping
//   down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus we have something like this:
// Bit 0: Since last sample
// Bit 1: Since last 2^2 samples
// Bit 2: Since last 2^4 samples
// Bit 3: ...
// Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
typedef unsigned char byte;

namespace __esan {

// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
static const u32 CurWorkingSetBitIdx = 0;
// The value OR-ed into a shadow byte on each access: it marks the line as
// part of both the current snapshot (bit 0) and the whole-run set (bit 7).
static const byte ShadowAccessedVal =
    (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

// The sideline thread that periodically invokes takeSample().
static SidelineThread Thread;
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};
80 | ||
81 | void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, | |
82 | bool IsWrite) { | |
83 | if (Size == 0) | |
84 | return; | |
85 | SIZE_T I = 0; | |
86 | uptr LineSize = getFlags()->cache_line_size; | |
87 | // As Addr+Size could overflow at the top of a 32-bit address space, | |
88 | // we avoid the simpler formula that rounds the start and end. | |
89 | SIZE_T NumLines = Size / LineSize + | |
90 | // Add any extra at the start or end adding on an extra line: | |
91 | (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; | |
92 | byte *Shadow = (byte *)appToShadow(Addr); | |
93 | // Write shadow bytes until we're word-aligned. | |
94 | while (I < NumLines && (uptr)Shadow % 4 != 0) { | |
95 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) | |
96 | *Shadow |= ShadowAccessedVal; | |
97 | ++Shadow; | |
98 | ++I; | |
99 | } | |
100 | // Write whole shadow words at a time. | |
101 | // Using a word-stride loop improves the runtime of a microbenchmark of | |
102 | // memset calls by 10%. | |
103 | u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | | |
104 | ShadowAccessedVal << 16 | ShadowAccessedVal << 24; | |
105 | while (I + 4 <= NumLines) { | |
106 | if ((*(u32*)Shadow & WordValue) != WordValue) | |
107 | *(u32*)Shadow |= WordValue; | |
108 | Shadow += 4; | |
109 | I += 4; | |
110 | } | |
111 | // Write any trailing shadow bytes. | |
112 | while (I < NumLines) { | |
113 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) | |
114 | *Shadow |= ShadowAccessedVal; | |
115 | ++Shadow; | |
116 | ++I; | |
117 | } | |
118 | } | |
119 | ||
120 | // This routine will word-align ShadowStart and ShadowEnd prior to scanning. | |
121 | // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit | |
122 | // measures the access during the entire execution and should never be cleared. | |
123 | static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, | |
124 | uptr ShadowEnd) { | |
125 | u32 WorkingSetSize = 0; | |
126 | u32 ByteValue = 0x1 << BitIdx; | |
127 | u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | | |
128 | ByteValue << 24; | |
129 | // Get word aligned start. | |
130 | ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); | |
131 | bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; | |
132 | // Do not clear the bit that measures access during the entire execution. | |
133 | bool Clear = BitIdx < TotalWorkingSetBitIdx; | |
134 | for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { | |
135 | if ((*Ptr & WordValue) != 0) { | |
136 | byte *BytePtr = (byte *)Ptr; | |
137 | for (u32 j = 0; j < sizeof(u32); ++j) { | |
138 | if (BytePtr[j] & ByteValue) { | |
139 | ++WorkingSetSize; | |
140 | if (Accum) { | |
141 | // Accumulate to the lower-frequency bit to the left. | |
142 | BytePtr[j] |= (ByteValue << 1); | |
143 | } | |
144 | } | |
145 | } | |
146 | if (Clear) { | |
147 | // Clear this bit from every shadow byte. | |
148 | *Ptr &= ~WordValue; | |
149 | } | |
150 | } | |
151 | } | |
152 | return WorkingSetSize; | |
153 | } | |
154 | ||
155 | // Scan shadow memory to calculate the number of cache lines being accessed, | |
156 | // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. | |
157 | // We also clear the lowest bits (most recent working set snapshot). | |
158 | // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit | |
159 | // measures the access during the entire execution and should never be cleared. | |
160 | static u32 computeWorkingSizeAndReset(u32 BitIdx) { | |
161 | u32 WorkingSetSize = 0; | |
162 | MemoryMappingLayout MemIter(true/*cache*/); | |
163 | uptr Start, End, Prot; | |
164 | while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/, | |
165 | 0/*file size*/, &Prot)) { | |
166 | VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", | |
167 | __FUNCTION__, Start, End, Prot, isAppMem(Start), | |
168 | isShadowMem(Start)); | |
169 | if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) { | |
170 | VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End); | |
171 | WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End); | |
172 | } | |
173 | } | |
174 | return WorkingSetSize; | |
175 | } | |
176 | ||
177 | // This is invoked from a signal handler but in a sideline thread doing nothing | |
178 | // else so it is a little less fragile than a typical signal handler. | |
179 | static void takeSample(void *Arg) { | |
180 | u32 BitIdx = CurWorkingSetBitIdx; | |
181 | u32 Freq = 1; | |
182 | ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. | |
183 | while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { | |
184 | u32 NumLines = computeWorkingSizeAndReset(BitIdx); | |
185 | VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, | |
186 | SnapshotNum, BitIdx, Freq, NumLines); | |
187 | SizePerFreq[BitIdx].push_back(NumLines); | |
188 | Freq = Freq << getFlags()->snapshot_step; | |
189 | BitIdx++; | |
190 | } | |
191 | } | |
192 | ||
7cac9316 XL |
193 | unsigned int getSampleCountWorkingSet() |
194 | { | |
195 | return SnapshotNum; | |
196 | } | |
197 | ||
5bcae85e SL |
// Initialization that must be done before any instrumented code is executed.
void initializeShadowWorkingSet() {
  // The shadow encoding is hardwired to 64-byte lines (CacheLineSize);
  // refuse to run with any other cache_line_size flag value.
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  // NOTE(review): presumably installs the handler for faults on shadow
  // pages — confirm against esan_shadow.h.
  registerMemoryFaultHandler();
}
203 | ||
204 | void initializeWorkingSet() { | |
205 | if (getFlags()->record_snapshots) { | |
206 | for (u32 i = 0; i < NumFreq; ++i) | |
207 | SizePerFreq[i].initialize(CircularBufferSizes[i]); | |
208 | Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); | |
209 | } | |
210 | } | |
211 | ||
// Converts a period in milliseconds into a human-friendly magnitude,
// returning the scaled value and setting Unit to "min", "sec", or "ms".
static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
  if (MilliSec > 600000) {
    Unit = "min";
    return MilliSec / 60000;
  }
  if (MilliSec > 10000) {
    Unit = "sec";
    return MilliSec / 1000;
  }
  Unit = "ms";
  return MilliSec;
}
224 | ||
// Converts a cache-line count into a human-friendly size, returning the
// scaled value and setting Unit to "MB", "KB", or "Bytes".
static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
  // We need a constant to avoid software divide support:
  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
  static const u32 MegabyteCachelines = KilobyteCachelines << 10;

  if (NumOfCachelines > 10 * MegabyteCachelines) {
    Unit = "MB";
    return NumOfCachelines / MegabyteCachelines;
  }
  if (NumOfCachelines > 10 * KilobyteCachelines) {
    Unit = "KB";
    return NumOfCachelines / KilobyteCachelines;
  }
  Unit = "Bytes";
  return NumOfCachelines * CacheLineSize;
}
241 | ||
242 | void reportWorkingSet() { | |
243 | const char *Unit; | |
244 | if (getFlags()->record_snapshots) { | |
245 | u32 Freq = 1; | |
246 | Report(" Total number of samples: %u\n", SnapshotNum); | |
247 | for (u32 i = 0; i < NumFreq; ++i) { | |
248 | u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); | |
249 | Report(" Samples array #%d at period %u %s\n", i, Time, Unit); | |
250 | // FIXME: report whether we wrapped around and thus whether we | |
251 | // have data on the whole run or just the last N samples. | |
252 | for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { | |
253 | u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); | |
254 | Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, | |
255 | SizePerFreq[i][j]); | |
256 | } | |
257 | Freq = Freq << getFlags()->snapshot_step; | |
258 | } | |
259 | } | |
260 | ||
261 | // Get the working set size for the entire execution. | |
262 | u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); | |
263 | u32 Size = getSizeForPrinting(NumOfCachelines, Unit); | |
264 | Report(" %s: the total working set size: %u %s (%u cache lines)\n", | |
265 | SanitizerToolName, Size, Unit, NumOfCachelines); | |
266 | } | |
267 | ||
268 | int finalizeWorkingSet() { | |
269 | if (getFlags()->record_snapshots) | |
270 | Thread.joinThread(); | |
271 | reportWorkingSet(); | |
272 | if (getFlags()->record_snapshots) { | |
273 | for (u32 i = 0; i < NumFreq; ++i) | |
274 | SizePerFreq[i].free(); | |
275 | } | |
276 | return 0; | |
277 | } | |
278 | ||
279 | } // namespace __esan |