]>
Commit | Line | Data |
---|---|---|
5bcae85e SL |
1 | //===-- working_set.cpp ---------------------------------------------------===// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | // This file is a part of EfficiencySanitizer, a family of performance tuners. | |
11 | // | |
12 | // This file contains working-set-specific code. | |
13 | //===----------------------------------------------------------------------===// | |
14 | ||
15 | #include "working_set.h" | |
16 | #include "esan.h" | |
17 | #include "esan_circular_buffer.h" | |
18 | #include "esan_flags.h" | |
19 | #include "esan_shadow.h" | |
20 | #include "esan_sideline.h" | |
21 | #include "sanitizer_common/sanitizer_procmaps.h" | |
22 | ||
// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies, each bit to the left from the lowest bit stepping
//   down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus we have something like this:
// Bit 0: Since last sample
// Bit 1: Since last 2^2 samples
// Bit 2: Since last 2^4 samples
// Bit 3: ...
// Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
typedef unsigned char byte;

namespace __esan {

// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
static const u32 CurWorkingSetBitIdx = 0;
// The value OR-ed into a shadow byte on each access: it marks the line as
// part of both the current snapshot (bit 0) and the whole-run set (bit 7).
static const byte ShadowAccessedVal =
    (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

// The sideline thread that periodically invokes takeSample().
static SidelineThread Thread;
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};
80 | ||
81 | void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, | |
82 | bool IsWrite) { | |
83 | if (Size == 0) | |
84 | return; | |
85 | SIZE_T I = 0; | |
86 | uptr LineSize = getFlags()->cache_line_size; | |
87 | // As Addr+Size could overflow at the top of a 32-bit address space, | |
88 | // we avoid the simpler formula that rounds the start and end. | |
89 | SIZE_T NumLines = Size / LineSize + | |
90 | // Add any extra at the start or end adding on an extra line: | |
91 | (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; | |
92 | byte *Shadow = (byte *)appToShadow(Addr); | |
93 | // Write shadow bytes until we're word-aligned. | |
94 | while (I < NumLines && (uptr)Shadow % 4 != 0) { | |
95 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) | |
96 | *Shadow |= ShadowAccessedVal; | |
97 | ++Shadow; | |
98 | ++I; | |
99 | } | |
100 | // Write whole shadow words at a time. | |
101 | // Using a word-stride loop improves the runtime of a microbenchmark of | |
102 | // memset calls by 10%. | |
103 | u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | | |
104 | ShadowAccessedVal << 16 | ShadowAccessedVal << 24; | |
105 | while (I + 4 <= NumLines) { | |
106 | if ((*(u32*)Shadow & WordValue) != WordValue) | |
107 | *(u32*)Shadow |= WordValue; | |
108 | Shadow += 4; | |
109 | I += 4; | |
110 | } | |
111 | // Write any trailing shadow bytes. | |
112 | while (I < NumLines) { | |
113 | if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) | |
114 | *Shadow |= ShadowAccessedVal; | |
115 | ++Shadow; | |
116 | ++I; | |
117 | } | |
118 | } | |
119 | ||
120 | // This routine will word-align ShadowStart and ShadowEnd prior to scanning. | |
121 | // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit | |
122 | // measures the access during the entire execution and should never be cleared. | |
123 | static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, | |
124 | uptr ShadowEnd) { | |
125 | u32 WorkingSetSize = 0; | |
126 | u32 ByteValue = 0x1 << BitIdx; | |
127 | u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | | |
128 | ByteValue << 24; | |
129 | // Get word aligned start. | |
130 | ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); | |
131 | bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; | |
132 | // Do not clear the bit that measures access during the entire execution. | |
133 | bool Clear = BitIdx < TotalWorkingSetBitIdx; | |
134 | for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { | |
135 | if ((*Ptr & WordValue) != 0) { | |
136 | byte *BytePtr = (byte *)Ptr; | |
137 | for (u32 j = 0; j < sizeof(u32); ++j) { | |
138 | if (BytePtr[j] & ByteValue) { | |
139 | ++WorkingSetSize; | |
140 | if (Accum) { | |
141 | // Accumulate to the lower-frequency bit to the left. | |
142 | BytePtr[j] |= (ByteValue << 1); | |
143 | } | |
144 | } | |
145 | } | |
146 | if (Clear) { | |
147 | // Clear this bit from every shadow byte. | |
148 | *Ptr &= ~WordValue; | |
149 | } | |
150 | } | |
151 | } | |
152 | return WorkingSetSize; | |
153 | } | |
154 | ||
155 | // Scan shadow memory to calculate the number of cache lines being accessed, | |
156 | // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. | |
157 | // We also clear the lowest bits (most recent working set snapshot). | |
158 | // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit | |
159 | // measures the access during the entire execution and should never be cleared. | |
160 | static u32 computeWorkingSizeAndReset(u32 BitIdx) { | |
161 | u32 WorkingSetSize = 0; | |
162 | MemoryMappingLayout MemIter(true/*cache*/); | |
163 | uptr Start, End, Prot; | |
164 | while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/, | |
165 | 0/*file size*/, &Prot)) { | |
166 | VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", | |
167 | __FUNCTION__, Start, End, Prot, isAppMem(Start), | |
168 | isShadowMem(Start)); | |
169 | if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) { | |
170 | VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End); | |
171 | WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End); | |
172 | } | |
173 | } | |
174 | return WorkingSetSize; | |
175 | } | |
176 | ||
177 | // This is invoked from a signal handler but in a sideline thread doing nothing | |
178 | // else so it is a little less fragile than a typical signal handler. | |
179 | static void takeSample(void *Arg) { | |
180 | u32 BitIdx = CurWorkingSetBitIdx; | |
181 | u32 Freq = 1; | |
182 | ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. | |
183 | while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { | |
184 | u32 NumLines = computeWorkingSizeAndReset(BitIdx); | |
185 | VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, | |
186 | SnapshotNum, BitIdx, Freq, NumLines); | |
187 | SizePerFreq[BitIdx].push_back(NumLines); | |
188 | Freq = Freq << getFlags()->snapshot_step; | |
189 | BitIdx++; | |
190 | } | |
191 | } | |
192 | ||
7cac9316 XL |
193 | unsigned int getSampleCountWorkingSet() |
194 | { | |
195 | return SnapshotNum; | |
196 | } | |
197 | ||
5bcae85e SL |
// Initialization that must be done before any instrumented code is executed.
void initializeShadowWorkingSet() {
  // The shadow encoding is hardwired to 64-byte lines (CacheLineSize);
  // refuse to run with any other cache_line_size flag value.
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  // NOTE(review): presumably installs the handler for faults on shadow
  // pages — confirm against esan_shadow.h.
  registerMemoryFaultHandler();
}
203 | ||
204 | void initializeWorkingSet() { | |
205 | if (getFlags()->record_snapshots) { | |
206 | for (u32 i = 0; i < NumFreq; ++i) | |
207 | SizePerFreq[i].initialize(CircularBufferSizes[i]); | |
208 | Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); | |
209 | } | |
210 | } | |
211 | ||
// Converts a period in milliseconds into a human-friendly magnitude,
// returning the scaled value and setting Unit to "min", "sec", or "ms".
static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
  if (MilliSec > 600000) {
    Unit = "min";
    return MilliSec / 60000;
  }
  if (MilliSec > 10000) {
    Unit = "sec";
    return MilliSec / 1000;
  }
  Unit = "ms";
  return MilliSec;
}
224 | ||
// Converts a cache-line count into a human-friendly size, returning the
// scaled value and setting Unit to "MB", "KB", or "Bytes".
static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
  // We need a constant to avoid software divide support:
  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
  static const u32 MegabyteCachelines = KilobyteCachelines << 10;

  if (NumOfCachelines > 10 * MegabyteCachelines) {
    Unit = "MB";
    return NumOfCachelines / MegabyteCachelines;
  }
  if (NumOfCachelines > 10 * KilobyteCachelines) {
    Unit = "KB";
    return NumOfCachelines / KilobyteCachelines;
  }
  Unit = "Bytes";
  return NumOfCachelines * CacheLineSize;
}
241 | ||
242 | void reportWorkingSet() { | |
243 | const char *Unit; | |
244 | if (getFlags()->record_snapshots) { | |
245 | u32 Freq = 1; | |
246 | Report(" Total number of samples: %u\n", SnapshotNum); | |
247 | for (u32 i = 0; i < NumFreq; ++i) { | |
248 | u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); | |
249 | Report(" Samples array #%d at period %u %s\n", i, Time, Unit); | |
250 | // FIXME: report whether we wrapped around and thus whether we | |
251 | // have data on the whole run or just the last N samples. | |
252 | for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { | |
253 | u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); | |
254 | Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, | |
255 | SizePerFreq[i][j]); | |
256 | } | |
257 | Freq = Freq << getFlags()->snapshot_step; | |
258 | } | |
259 | } | |
260 | ||
261 | // Get the working set size for the entire execution. | |
262 | u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); | |
263 | u32 Size = getSizeForPrinting(NumOfCachelines, Unit); | |
264 | Report(" %s: the total working set size: %u %s (%u cache lines)\n", | |
265 | SanitizerToolName, Size, Unit, NumOfCachelines); | |
266 | } | |
267 | ||
268 | int finalizeWorkingSet() { | |
269 | if (getFlags()->record_snapshots) | |
270 | Thread.joinThread(); | |
271 | reportWorkingSet(); | |
272 | if (getFlags()->record_snapshots) { | |
273 | for (u32 i = 0; i < NumFreq; ++i) | |
274 | SizePerFreq[i].free(); | |
275 | } | |
276 | return 0; | |
277 | } | |
278 | ||
279 | } // namespace __esan |