1 //===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements a profile inference algorithm. Given incomplete and
// possibly imprecise block counts, the algorithm reconstructs realistic block
// and edge counts that satisfy flow conservation rules, while minimally
// modifying the input block counts.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #include "llvm/Transforms/Utils/SampleProfileInference.h"
17 #include "llvm/ADT/BitVector.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
20 #include <queue>
21 #include <set>
22 #include <stack>
23
24 using namespace llvm;
25 #define DEBUG_TYPE "sample-profile-inference"
26
27 namespace {
28
29 static cl::opt<bool> SampleProfileEvenFlowDistribution(
30 "sample-profile-even-flow-distribution", cl::init(true), cl::Hidden,
31 cl::desc("Try to evenly distribute flow when there are multiple equally "
32 "likely options."));
33
34 static cl::opt<bool> SampleProfileRebalanceUnknown(
35 "sample-profile-rebalance-unknown", cl::init(true), cl::Hidden,
36 cl::desc("Evenly re-distribute flow among unknown subgraphs."));
37
38 static cl::opt<bool> SampleProfileJoinIslands(
39 "sample-profile-join-islands", cl::init(true), cl::Hidden,
40 cl::desc("Join isolated components having positive flow."));
41
42 static cl::opt<unsigned> SampleProfileProfiCostBlockInc(
43 "sample-profile-profi-cost-block-inc", cl::init(10), cl::Hidden,
44 cl::desc("The cost of increasing a block's count by one."));
45
46 static cl::opt<unsigned> SampleProfileProfiCostBlockDec(
47 "sample-profile-profi-cost-block-dec", cl::init(20), cl::Hidden,
48 cl::desc("The cost of decreasing a block's count by one."));
49
50 static cl::opt<unsigned> SampleProfileProfiCostBlockEntryInc(
51 "sample-profile-profi-cost-block-entry-inc", cl::init(40), cl::Hidden,
52 cl::desc("The cost of increasing the entry block's count by one."));
53
54 static cl::opt<unsigned> SampleProfileProfiCostBlockEntryDec(
55 "sample-profile-profi-cost-block-entry-dec", cl::init(10), cl::Hidden,
56 cl::desc("The cost of decreasing the entry block's count by one."));
57
58 static cl::opt<unsigned> SampleProfileProfiCostBlockZeroInc(
59 "sample-profile-profi-cost-block-zero-inc", cl::init(11), cl::Hidden,
60 cl::desc("The cost of increasing a count of zero-weight block by one."));
61
62 static cl::opt<unsigned> SampleProfileProfiCostBlockUnknownInc(
63 "sample-profile-profi-cost-block-unknown-inc", cl::init(0), cl::Hidden,
64 cl::desc("The cost of increasing an unknown block's count by one."));
65
66 /// A value indicating an infinite flow/capacity/weight of a block/edge.
67 /// Not using numeric_limits<int64_t>::max(), as the values can be summed up
68 /// during the execution.
69 static constexpr int64_t INF = ((int64_t)1) << 50;
70
71 /// The minimum-cost maximum flow algorithm.
72 ///
73 /// The algorithm finds the maximum flow of minimum cost on a given (directed)
74 /// network using a modified version of the classical Moore-Bellman-Ford
75 /// approach. The algorithm applies a number of augmentation iterations in which
76 /// flow is sent along paths of positive capacity from the source to the sink.
/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
/// m is the number of edges, n is the number of vertices, and v(f) is the
79 /// value of the maximum flow. However, the observed running time on typical
80 /// instances is sub-quadratic, that is, o(n^2).
81 ///
82 /// The input is a set of edges with specified costs and capacities, and a pair
83 /// of nodes (source and sink). The output is the flow along each edge of the
84 /// minimum total cost respecting the given edge capacities.
85 class MinCostMaxFlow {
86 public:
  MinCostMaxFlow(const ProfiParams &Params) : Params(Params) {}
88
89 // Initialize algorithm's data structures for a network of a given size.
  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
91 Source = SourceNode;
92 Target = SinkNode;
93
94 Nodes = std::vector<Node>(NodeCount);
95 Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
96 if (Params.EvenFlowDistribution)
97 AugmentingEdges =
98 std::vector<std::vector<Edge *>>(NodeCount, std::vector<Edge *>());
99 }
100
101 // Run the algorithm.
  int64_t run() {
103 LLVM_DEBUG(dbgs() << "Starting profi for " << Nodes.size() << " nodes\n");
104
105 // Iteratively find an augmentation path/dag in the network and send the
106 // flow along its edges
107 size_t AugmentationIters = applyFlowAugmentation();
108
109 // Compute the total flow and its cost
110 int64_t TotalCost = 0;
111 int64_t TotalFlow = 0;
112 for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
113 for (auto &Edge : Edges[Src]) {
114 if (Edge.Flow > 0) {
115 TotalCost += Edge.Cost * Edge.Flow;
116 if (Src == Source)
117 TotalFlow += Edge.Flow;
118 }
119 }
120 }
121 LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
122 << " iterations with " << TotalFlow << " total flow"
123 << " of " << TotalCost << " cost\n");
124 (void)TotalFlow;
125 (void)AugmentationIters;
126 return TotalCost;
127 }
128
129 /// Adding an edge to the network with a specified capacity and a cost.
130 /// Multiple edges between a pair of nodes are allowed but self-edges
131 /// are not supported.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
133 assert(Capacity > 0 && "adding an edge of zero capacity");
    assert(Src != Dst && "loop edges are not supported");
135
136 Edge SrcEdge;
137 SrcEdge.Dst = Dst;
138 SrcEdge.Cost = Cost;
139 SrcEdge.Capacity = Capacity;
140 SrcEdge.Flow = 0;
141 SrcEdge.RevEdgeIndex = Edges[Dst].size();
142
143 Edge DstEdge;
144 DstEdge.Dst = Src;
145 DstEdge.Cost = -Cost;
146 DstEdge.Capacity = 0;
147 DstEdge.Flow = 0;
148 DstEdge.RevEdgeIndex = Edges[Src].size();
149
150 Edges[Src].push_back(SrcEdge);
151 Edges[Dst].push_back(DstEdge);
152 }
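  // Note, for illustration: a call such as addEdge(1, 2, /*Capacity=*/5,
  // /*Cost=*/3) (node indices are made-up example values) appends a forward
  // edge to Edges[1] and a residual (reverse) edge of zero capacity and
  // cost -3 to Edges[2]; RevEdgeIndex lets the augmentation steps update
  // both copies in constant time.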
153
154 /// Adding an edge to the network of infinite capacity and a given cost.
  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
156 addEdge(Src, Dst, INF, Cost);
157 }
158
159 /// Get the total flow from a given source node.
160 /// Returns a list of pairs (target node, amount of flow to the target).
  const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
162 std::vector<std::pair<uint64_t, int64_t>> Flow;
163 for (const auto &Edge : Edges[Src]) {
164 if (Edge.Flow > 0)
165 Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
166 }
167 return Flow;
168 }
169
170 /// Get the total flow between a pair of nodes.
  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
172 int64_t Flow = 0;
173 for (const auto &Edge : Edges[Src]) {
174 if (Edge.Dst == Dst) {
175 Flow += Edge.Flow;
176 }
177 }
178 return Flow;
179 }
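  // Illustrative usage sketch (not part of the pass); the node count,
  // capacities, and costs below are made-up example values:
  //
  //   MinCostMaxFlow Flow(Params);
  //   Flow.initialize(/*NodeCount=*/3, /*SourceNode=*/0, /*SinkNode=*/2);
  //   Flow.addEdge(0, 1, /*Capacity=*/5, /*Cost=*/1);
  //   Flow.addEdge(1, 2, /*Capacity=*/5, /*Cost=*/1);
  //   int64_t TotalCost = Flow.run();         // 10 for this tiny network
  //   int64_t FlowOn01 = Flow.getFlow(0, 1);  // 5 units along edge (0, 1)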
180
181 private:
182 /// Iteratively find an augmentation path/dag in the network and send the
183 /// flow along its edges. The method returns the number of applied iterations.
  size_t applyFlowAugmentation() {
185 size_t AugmentationIters = 0;
186 while (findAugmentingPath()) {
187 uint64_t PathCapacity = computeAugmentingPathCapacity();
188 while (PathCapacity > 0) {
189 bool Progress = false;
190 if (Params.EvenFlowDistribution) {
191 // Identify node/edge candidates for augmentation
192 identifyShortestEdges(PathCapacity);
193
194 // Find an augmenting DAG
195 auto AugmentingOrder = findAugmentingDAG();
196
197 // Apply the DAG augmentation
198 Progress = augmentFlowAlongDAG(AugmentingOrder);
199 PathCapacity = computeAugmentingPathCapacity();
200 }
201
202 if (!Progress) {
203 augmentFlowAlongPath(PathCapacity);
204 PathCapacity = 0;
205 }
206
207 AugmentationIters++;
208 }
209 }
210 return AugmentationIters;
211 }
212
  /// Compute the capacity of the canonical augmenting path. If the path is
214 /// saturated (that is, no flow can be sent along the path), then return 0.
  uint64_t computeAugmentingPathCapacity() {
216 uint64_t PathCapacity = INF;
217 uint64_t Now = Target;
218 while (Now != Source) {
219 uint64_t Pred = Nodes[Now].ParentNode;
220 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
221
222 assert(Edge.Capacity >= Edge.Flow && "incorrect edge flow");
223 uint64_t EdgeCapacity = uint64_t(Edge.Capacity - Edge.Flow);
224 PathCapacity = std::min(PathCapacity, EdgeCapacity);
225
226 Now = Pred;
227 }
228 return PathCapacity;
229 }
230
231 /// Check for existence of an augmenting path with a positive capacity.
  bool findAugmentingPath() {
233 // Initialize data structures
234 for (auto &Node : Nodes) {
235 Node.Distance = INF;
236 Node.ParentNode = uint64_t(-1);
237 Node.ParentEdgeIndex = uint64_t(-1);
238 Node.Taken = false;
239 }
240
241 std::queue<uint64_t> Queue;
242 Queue.push(Source);
243 Nodes[Source].Distance = 0;
244 Nodes[Source].Taken = true;
245 while (!Queue.empty()) {
246 uint64_t Src = Queue.front();
247 Queue.pop();
248 Nodes[Src].Taken = false;
249 // Although the residual network contains edges with negative costs
250 // (in particular, backward edges), it can be shown that there are no
251 // negative-weight cycles and the following two invariants are maintained:
252 // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
253 // where Dist is the length of the shortest path between two nodes. This
      // allows pruning the search space of the path-finding algorithm using
255 // the following early-stop criteria:
256 // -- If we find a path with zero-distance from Source to Target, stop the
257 // search, as the path is the shortest since Dist[Source, Target] >= 0;
258 // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
259 // process node V, as it is guaranteed _not_ to be on a shortest path
260 // from Source to Target; it follows from inequalities
261 // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
262 // >= Dist[Source, V]
263 if (!Params.EvenFlowDistribution && Nodes[Target].Distance == 0)
264 break;
265 if (Nodes[Src].Distance > Nodes[Target].Distance)
266 continue;
267
268 // Process adjacent edges
269 for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
270 auto &Edge = Edges[Src][EdgeIdx];
271 if (Edge.Flow < Edge.Capacity) {
272 uint64_t Dst = Edge.Dst;
273 int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
274 if (Nodes[Dst].Distance > NewDistance) {
275 // Update the distance and the parent node/edge
276 Nodes[Dst].Distance = NewDistance;
277 Nodes[Dst].ParentNode = Src;
278 Nodes[Dst].ParentEdgeIndex = EdgeIdx;
279 // Add the node to the queue, if it is not there yet
280 if (!Nodes[Dst].Taken) {
281 Queue.push(Dst);
282 Nodes[Dst].Taken = true;
283 }
284 }
285 }
286 }
287 }
288
289 return Nodes[Target].Distance != INF;
290 }
291
292 /// Update the current flow along the augmenting path.
  void augmentFlowAlongPath(uint64_t PathCapacity) {
294 assert(PathCapacity > 0 && "found an incorrect augmenting path");
295 uint64_t Now = Target;
296 while (Now != Source) {
297 uint64_t Pred = Nodes[Now].ParentNode;
298 auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
299 auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
300
301 Edge.Flow += PathCapacity;
302 RevEdge.Flow -= PathCapacity;
303
304 Now = Pred;
305 }
306 }
307
308 /// Find an Augmenting DAG order using a modified version of DFS in which we
309 /// can visit a node multiple times. In the DFS search, when scanning each
310 /// edge out of a node, continue search at Edge.Dst endpoint if it has not
311 /// been discovered yet and its NumCalls < MaxDfsCalls. The algorithm
312 /// runs in O(MaxDfsCalls * |Edges| + |Nodes|) time.
313 /// It returns an Augmenting Order (Taken nodes in decreasing Finish time)
314 /// that starts with Source and ends with Target.
  std::vector<uint64_t> findAugmentingDAG() {
    // We use a stack-based implementation of DFS to avoid recursion.
317 // Defining DFS data structures:
318 // A pair (NodeIdx, EdgeIdx) at the top of the Stack denotes that
319 // - we are currently visiting Nodes[NodeIdx] and
320 // - the next edge to scan is Edges[NodeIdx][EdgeIdx]
321 typedef std::pair<uint64_t, uint64_t> StackItemType;
322 std::stack<StackItemType> Stack;
323 std::vector<uint64_t> AugmentingOrder;
324
325 // Phase 0: Initialize Node attributes and Time for DFS run
326 for (auto &Node : Nodes) {
327 Node.Discovery = 0;
328 Node.Finish = 0;
329 Node.NumCalls = 0;
330 Node.Taken = false;
331 }
332 uint64_t Time = 0;
333 // Mark Target as Taken
334 // Taken attribute will be propagated backwards from Target towards Source
335 Nodes[Target].Taken = true;
336
337 // Phase 1: Start DFS traversal from Source
338 Stack.emplace(Source, 0);
339 Nodes[Source].Discovery = ++Time;
340 while (!Stack.empty()) {
341 auto NodeIdx = Stack.top().first;
342 auto EdgeIdx = Stack.top().second;
343
344 // If we haven't scanned all edges out of NodeIdx, continue scanning
345 if (EdgeIdx < Edges[NodeIdx].size()) {
346 auto &Edge = Edges[NodeIdx][EdgeIdx];
347 auto &Dst = Nodes[Edge.Dst];
348 Stack.top().second++;
349
350 if (Edge.OnShortestPath) {
351 // If we haven't seen Edge.Dst so far, continue DFS search there
352 if (Dst.Discovery == 0 && Dst.NumCalls < MaxDfsCalls) {
353 Dst.Discovery = ++Time;
354 Stack.emplace(Edge.Dst, 0);
355 Dst.NumCalls++;
356 } else if (Dst.Taken && Dst.Finish != 0) {
          // Else, if Edge.Dst already has a path to Target, then NodeIdx has
          // one as well; mark NodeIdx as Taken
358 Nodes[NodeIdx].Taken = true;
359 }
360 }
361 } else {
        // If we are done scanning all edges out of NodeIdx
363 Stack.pop();
364 // If we haven't found a path from NodeIdx to Target, forget about it
365 if (!Nodes[NodeIdx].Taken) {
366 Nodes[NodeIdx].Discovery = 0;
367 } else {
368 // If we have found a path from NodeIdx to Target, then finish NodeIdx
369 // and propagate Taken flag to DFS parent unless at the Source
370 Nodes[NodeIdx].Finish = ++Time;
371 // NodeIdx == Source if and only if the stack is empty
372 if (NodeIdx != Source) {
373 assert(!Stack.empty() && "empty stack while running dfs");
374 Nodes[Stack.top().first].Taken = true;
375 }
376 AugmentingOrder.push_back(NodeIdx);
377 }
378 }
379 }
380 // Nodes are collected decreasing Finish time, so the order is reversed
381 std::reverse(AugmentingOrder.begin(), AugmentingOrder.end());
382
383 // Phase 2: Extract all forward (DAG) edges and fill in AugmentingEdges
384 for (size_t Src : AugmentingOrder) {
385 AugmentingEdges[Src].clear();
386 for (auto &Edge : Edges[Src]) {
387 uint64_t Dst = Edge.Dst;
388 if (Edge.OnShortestPath && Nodes[Src].Taken && Nodes[Dst].Taken &&
389 Nodes[Dst].Finish < Nodes[Src].Finish) {
390 AugmentingEdges[Src].push_back(&Edge);
391 }
392 }
393 assert((Src == Target || !AugmentingEdges[Src].empty()) &&
394 "incorrectly constructed augmenting edges");
395 }
396
397 return AugmentingOrder;
398 }
399
400 /// Update the current flow along the given (acyclic) subgraph specified by
401 /// the vertex order, AugmentingOrder. The objective is to send as much flow
402 /// as possible while evenly distributing flow among successors of each node.
403 /// After the update at least one edge is saturated.
  bool augmentFlowAlongDAG(const std::vector<uint64_t> &AugmentingOrder) {
405 // Phase 0: Initialization
406 for (uint64_t Src : AugmentingOrder) {
407 Nodes[Src].FracFlow = 0;
408 Nodes[Src].IntFlow = 0;
409 for (auto &Edge : AugmentingEdges[Src]) {
410 Edge->AugmentedFlow = 0;
411 }
412 }
413
414 // Phase 1: Send a unit of fractional flow along the DAG
415 uint64_t MaxFlowAmount = INF;
416 Nodes[Source].FracFlow = 1.0;
417 for (uint64_t Src : AugmentingOrder) {
418 assert((Src == Target || Nodes[Src].FracFlow > 0.0) &&
419 "incorrectly computed fractional flow");
420 // Distribute flow evenly among successors of Src
421 uint64_t Degree = AugmentingEdges[Src].size();
422 for (auto &Edge : AugmentingEdges[Src]) {
423 double EdgeFlow = Nodes[Src].FracFlow / Degree;
424 Nodes[Edge->Dst].FracFlow += EdgeFlow;
425 if (Edge->Capacity == INF)
426 continue;
427 uint64_t MaxIntFlow = double(Edge->Capacity - Edge->Flow) / EdgeFlow;
428 MaxFlowAmount = std::min(MaxFlowAmount, MaxIntFlow);
429 }
430 }
431 // Stop early if we cannot send any (integral) flow from Source to Target
432 if (MaxFlowAmount == 0)
433 return false;
434
435 // Phase 2: Send an integral flow of MaxFlowAmount
436 Nodes[Source].IntFlow = MaxFlowAmount;
437 for (uint64_t Src : AugmentingOrder) {
438 if (Src == Target)
439 break;
440 // Distribute flow evenly among successors of Src, rounding up to make
441 // sure all flow is sent
442 uint64_t Degree = AugmentingEdges[Src].size();
      // We are guaranteed that Node[Src].IntFlow <= SuccFlow * Degree
444 uint64_t SuccFlow = (Nodes[Src].IntFlow + Degree - 1) / Degree;
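      // For illustration: IntFlow = 7 with Degree = 3 gives SuccFlow = 3, so
      // (capacities permitting) the successors receive 3, 3, and 1 units.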
445 for (auto &Edge : AugmentingEdges[Src]) {
446 uint64_t Dst = Edge->Dst;
447 uint64_t EdgeFlow = std::min(Nodes[Src].IntFlow, SuccFlow);
448 EdgeFlow = std::min(EdgeFlow, uint64_t(Edge->Capacity - Edge->Flow));
449 Nodes[Dst].IntFlow += EdgeFlow;
450 Nodes[Src].IntFlow -= EdgeFlow;
451 Edge->AugmentedFlow += EdgeFlow;
452 }
453 }
454 assert(Nodes[Target].IntFlow <= MaxFlowAmount);
455 Nodes[Target].IntFlow = 0;
456
457 // Phase 3: Send excess flow back traversing the nodes backwards.
458 // Because of rounding, not all flow can be sent along the edges of Src.
459 // Hence, sending the remaining flow back to maintain flow conservation
460 for (size_t Idx = AugmentingOrder.size() - 1; Idx > 0; Idx--) {
461 uint64_t Src = AugmentingOrder[Idx - 1];
462 // Try to send excess flow back along each edge.
463 // Make sure we only send back flow we just augmented (AugmentedFlow).
464 for (auto &Edge : AugmentingEdges[Src]) {
465 uint64_t Dst = Edge->Dst;
466 if (Nodes[Dst].IntFlow == 0)
467 continue;
468 uint64_t EdgeFlow = std::min(Nodes[Dst].IntFlow, Edge->AugmentedFlow);
469 Nodes[Dst].IntFlow -= EdgeFlow;
470 Nodes[Src].IntFlow += EdgeFlow;
471 Edge->AugmentedFlow -= EdgeFlow;
472 }
473 }
474
475 // Phase 4: Update flow values along all edges
476 bool HasSaturatedEdges = false;
477 for (uint64_t Src : AugmentingOrder) {
478 // Verify that we have sent all the excess flow from the node
479 assert(Src == Source || Nodes[Src].IntFlow == 0);
480 for (auto &Edge : AugmentingEdges[Src]) {
481 assert(uint64_t(Edge->Capacity - Edge->Flow) >= Edge->AugmentedFlow);
482 // Update flow values along the edge and its reverse copy
483 auto &RevEdge = Edges[Edge->Dst][Edge->RevEdgeIndex];
484 Edge->Flow += Edge->AugmentedFlow;
485 RevEdge.Flow -= Edge->AugmentedFlow;
486 if (Edge->Capacity == Edge->Flow && Edge->AugmentedFlow > 0)
487 HasSaturatedEdges = true;
488 }
489 }
490
491 // The augmentation is successful iff at least one edge becomes saturated
492 return HasSaturatedEdges;
493 }
494
495 /// Identify candidate (shortest) edges for augmentation.
  void identifyShortestEdges(uint64_t PathCapacity) {
497 assert(PathCapacity > 0 && "found an incorrect augmenting DAG");
498 // To make sure the augmentation DAG contains only edges with large residual
499 // capacity, we prune all edges whose capacity is below a fraction of
500 // the capacity of the augmented path.
501 // (All edges of the path itself are always in the DAG)
502 uint64_t MinCapacity = std::max(PathCapacity / 2, uint64_t(1));
503
504 // Decide which edges are on a shortest path from Source to Target
505 for (size_t Src = 0; Src < Nodes.size(); Src++) {
506 // An edge cannot be augmenting if the endpoint has large distance
507 if (Nodes[Src].Distance > Nodes[Target].Distance)
508 continue;
509
510 for (auto &Edge : Edges[Src]) {
511 uint64_t Dst = Edge.Dst;
512 Edge.OnShortestPath =
513 Src != Target && Dst != Source &&
514 Nodes[Dst].Distance <= Nodes[Target].Distance &&
515 Nodes[Dst].Distance == Nodes[Src].Distance + Edge.Cost &&
516 Edge.Capacity > Edge.Flow &&
517 uint64_t(Edge.Capacity - Edge.Flow) >= MinCapacity;
518 }
519 }
520 }
521
522 /// Maximum number of DFS iterations for DAG finding.
523 static constexpr uint64_t MaxDfsCalls = 10;
524
525 /// A node in a flow network.
526 struct Node {
527 /// The cost of the cheapest path from the source to the current node.
528 int64_t Distance;
529 /// The node preceding the current one in the path.
530 uint64_t ParentNode;
531 /// The index of the edge between ParentNode and the current node.
532 uint64_t ParentEdgeIndex;
533 /// An indicator of whether the current node is in a queue.
534 bool Taken;
535
536 /// Data fields utilized in DAG-augmentation:
537 /// Fractional flow.
538 double FracFlow;
539 /// Integral flow.
540 uint64_t IntFlow;
541 /// Discovery time.
542 uint64_t Discovery;
543 /// Finish time.
544 uint64_t Finish;
545 /// NumCalls.
546 uint64_t NumCalls;
547 };
548
549 /// An edge in a flow network.
550 struct Edge {
551 /// The cost of the edge.
552 int64_t Cost;
553 /// The capacity of the edge.
554 int64_t Capacity;
555 /// The current flow on the edge.
556 int64_t Flow;
557 /// The destination node of the edge.
558 uint64_t Dst;
559 /// The index of the reverse edge between Dst and the current node.
560 uint64_t RevEdgeIndex;
561
562 /// Data fields utilized in DAG-augmentation:
563 /// Whether the edge is currently on a shortest path from Source to Target.
564 bool OnShortestPath;
565 /// Extra flow along the edge.
566 uint64_t AugmentedFlow;
567 };
568
569 /// The set of network nodes.
570 std::vector<Node> Nodes;
571 /// The set of network edges.
572 std::vector<std::vector<Edge>> Edges;
573 /// Source node of the flow.
574 uint64_t Source;
575 /// Target (sink) node of the flow.
576 uint64_t Target;
577 /// Augmenting edges.
578 std::vector<std::vector<Edge *>> AugmentingEdges;
579 /// Params for flow computation.
580 const ProfiParams &Params;
581 };
582
583 /// A post-processing adjustment of the control flow. It applies two steps by
584 /// rerouting some flow and making it more realistic:
585 ///
586 /// - First, it removes all isolated components ("islands") with a positive flow
587 /// that are unreachable from the entry block. For every such component, we
///   find the shortest path from the entry to an exit passing through the
///   component, and increase the flow by one unit along the path.
590 ///
591 /// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
///   with no sampled counts. Then it rebalances the flow that goes through such
593 /// a subgraph so that each branch is taken with probability 50%.
594 /// An unknown subgraph is such that for every two nodes u and v:
595 /// - u dominates v and u is not unknown;
596 /// - v post-dominates u; and
597 /// - all inner-nodes of all (u,v)-paths are unknown.
598 ///
599 class FlowAdjuster {
600 public:
  FlowAdjuster(const ProfiParams &Params, FlowFunction &Func)
      : Params(Params), Func(Func) {}
603
604 /// Apply the post-processing.
  void run() {
606 if (Params.JoinIslands) {
607 // Adjust the flow to get rid of isolated components
608 joinIsolatedComponents();
609 }
610
611 if (Params.RebalanceUnknown) {
612 // Rebalance the flow inside unknown subgraphs
613 rebalanceUnknownSubgraphs();
614 }
615 }
616
617 private:
  void joinIsolatedComponents() {
619 // Find blocks that are reachable from the source
620 auto Visited = BitVector(NumBlocks(), false);
621 findReachable(Func.Entry, Visited);
622
623 // Iterate over all non-reachable blocks and adjust their weights
624 for (uint64_t I = 0; I < NumBlocks(); I++) {
625 auto &Block = Func.Blocks[I];
626 if (Block.Flow > 0 && !Visited[I]) {
627 // Find a path from the entry to an exit passing through the block I
628 auto Path = findShortestPath(I);
629 // Increase the flow along the path
630 assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
631 "incorrectly computed path adjusting control flow");
632 Func.Blocks[Func.Entry].Flow += 1;
633 for (auto &Jump : Path) {
634 Jump->Flow += 1;
635 Func.Blocks[Jump->Target].Flow += 1;
636 // Update reachability
637 findReachable(Jump->Target, Visited);
638 }
639 }
640 }
641 }
642
643 /// Run BFS from a given block along the jumps with a positive flow and mark
644 /// all reachable blocks.
  void findReachable(uint64_t Src, BitVector &Visited) {
646 if (Visited[Src])
647 return;
648 std::queue<uint64_t> Queue;
649 Queue.push(Src);
650 Visited[Src] = true;
651 while (!Queue.empty()) {
652 Src = Queue.front();
653 Queue.pop();
654 for (auto *Jump : Func.Blocks[Src].SuccJumps) {
655 uint64_t Dst = Jump->Target;
656 if (Jump->Flow > 0 && !Visited[Dst]) {
657 Queue.push(Dst);
658 Visited[Dst] = true;
659 }
660 }
661 }
662 }
663
664 /// Find the shortest path from the entry block to an exit block passing
665 /// through a given block.
  std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
667 // A path from the entry block to BlockIdx
668 auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
669 // A path from BlockIdx to an exit block
670 auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);
671
672 // Concatenate the two paths
673 std::vector<FlowJump *> Result;
674 Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
675 Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
676 return Result;
677 }
678
679 /// Apply the Dijkstra algorithm to find the shortest path from a given
680 /// Source to a given Target block.
681 /// If Target == -1, then the path ends at an exit block.
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
683 // Quit early, if possible
684 if (Source == Target)
685 return std::vector<FlowJump *>();
686 if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
687 return std::vector<FlowJump *>();
688
689 // Initialize data structures
690 auto Distance = std::vector<int64_t>(NumBlocks(), INF);
691 auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
692 Distance[Source] = 0;
693 std::set<std::pair<uint64_t, uint64_t>> Queue;
694 Queue.insert(std::make_pair(Distance[Source], Source));
695
696 // Run the Dijkstra algorithm
697 while (!Queue.empty()) {
698 uint64_t Src = Queue.begin()->second;
699 Queue.erase(Queue.begin());
700 // If we found a solution, quit early
701 if (Src == Target ||
702 (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
703 break;
704
705 for (auto *Jump : Func.Blocks[Src].SuccJumps) {
706 uint64_t Dst = Jump->Target;
707 int64_t JumpDist = jumpDistance(Jump);
708 if (Distance[Dst] > Distance[Src] + JumpDist) {
709 Queue.erase(std::make_pair(Distance[Dst], Dst));
710
711 Distance[Dst] = Distance[Src] + JumpDist;
712 Parent[Dst] = Jump;
713
714 Queue.insert(std::make_pair(Distance[Dst], Dst));
715 }
716 }
717 }
718 // If Target is not provided, find the closest exit block
719 if (Target == AnyExitBlock) {
720 for (uint64_t I = 0; I < NumBlocks(); I++) {
721 if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
722 if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
723 Target = I;
724 }
725 }
726 }
727 }
728 assert(Parent[Target] != nullptr && "a path does not exist");
729
730 // Extract the constructed path
731 std::vector<FlowJump *> Result;
732 uint64_t Now = Target;
733 while (Now != Source) {
734 assert(Now == Parent[Now]->Target && "incorrect parent jump");
735 Result.push_back(Parent[Now]);
736 Now = Parent[Now]->Source;
737 }
738 // Reverse the path, since it is extracted from Target to Source
739 std::reverse(Result.begin(), Result.end());
740 return Result;
741 }
742
743 /// A distance of a path for a given jump.
  /// In order to encourage the path to use blocks/jumps with large positive
  /// flow, and to avoid drastically changing the branch probabilities of
  /// outgoing edges, set the jump distance so that it:
  ///   - minimizes the number of unlikely jumps used, and subject to that,
  ///   - minimizes the number of Flow == 0 jumps used, and subject to that,
  ///   - minimizes the total multiplicative Flow increase for the remaining
  ///     edges.
750 /// To capture this objective with integer distances, we round off fractional
751 /// parts to a multiple of 1 / BaseDistance.
  int64_t jumpDistance(FlowJump *Jump) const {
753 if (Jump->IsUnlikely)
754 return Params.CostUnlikely;
755 uint64_t BaseDistance =
756 std::max(FlowAdjuster::MinBaseDistance,
757 std::min(Func.Blocks[Func.Entry].Flow,
758 Params.CostUnlikely / (2 * (NumBlocks() + 1))));
759 if (Jump->Flow > 0)
760 return BaseDistance + BaseDistance / Jump->Flow;
761 return 2 * BaseDistance * (NumBlocks() + 1);
762 };
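  // Worked example with hypothetical values: for CostUnlikely = 500000,
  // NumBlocks() = 9, and entry flow 100, BaseDistance = max(10000,
  // min(100, 500000 / 20)) = 10000; a jump with Flow == 4 then gets distance
  // 10000 + 10000 / 4 = 12500, a Flow == 0 jump gets 2 * 10000 * 10 = 200000,
  // and an unlikely jump gets CostUnlikely itself.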
763
  uint64_t NumBlocks() const { return Func.Blocks.size(); }
765
766 /// Rebalance unknown subgraphs so that the flow is split evenly across the
767 /// outgoing branches of every block of the subgraph. The method iterates over
768 /// blocks with known weight and identifies unknown subgraphs rooted at the
769 /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
  void rebalanceUnknownSubgraphs() {
771 // Try to find unknown subgraphs from each block
772 for (const FlowBlock &SrcBlock : Func.Blocks) {
773 // Verify if rebalancing rooted at SrcBlock is feasible
774 if (!canRebalanceAtRoot(&SrcBlock))
775 continue;
776
      // Find an unknown subgraph starting at SrcBlock. Along the way,
778 // fill in known destinations and intermediate unknown blocks.
779 std::vector<FlowBlock *> UnknownBlocks;
780 std::vector<FlowBlock *> KnownDstBlocks;
781 findUnknownSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks);
782
783 // Verify if rebalancing of the subgraph is feasible. If the search is
784 // successful, find the unique destination block (which can be null)
785 FlowBlock *DstBlock = nullptr;
786 if (!canRebalanceSubgraph(&SrcBlock, KnownDstBlocks, UnknownBlocks,
787 DstBlock))
788 continue;
789
790 // We cannot rebalance subgraphs containing cycles among unknown blocks
791 if (!isAcyclicSubgraph(&SrcBlock, DstBlock, UnknownBlocks))
792 continue;
793
794 // Rebalance the flow
795 rebalanceUnknownSubgraph(&SrcBlock, DstBlock, UnknownBlocks);
796 }
797 }
798
799 /// Verify if rebalancing rooted at a given block is possible.
  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
801 // Do not attempt to find unknown subgraphs from an unknown or a
802 // zero-flow block
803 if (SrcBlock->HasUnknownWeight || SrcBlock->Flow == 0)
804 return false;
805
    // Do not attempt to process subgraphs from a block w/o unknown successors
807 bool HasUnknownSuccs = false;
808 for (auto *Jump : SrcBlock->SuccJumps) {
809 if (Func.Blocks[Jump->Target].HasUnknownWeight) {
810 HasUnknownSuccs = true;
811 break;
812 }
813 }
814 if (!HasUnknownSuccs)
815 return false;
816
817 return true;
818 }
819
820 /// Find an unknown subgraph starting at block SrcBlock. The method sets
821 /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
  void findUnknownSubgraph(const FlowBlock *SrcBlock,
                           std::vector<FlowBlock *> &KnownDstBlocks,
                           std::vector<FlowBlock *> &UnknownBlocks) {
825 // Run BFS from SrcBlock and make sure all paths are going through unknown
826 // blocks and end at a known DstBlock
827 auto Visited = BitVector(NumBlocks(), false);
828 std::queue<uint64_t> Queue;
829
830 Queue.push(SrcBlock->Index);
831 Visited[SrcBlock->Index] = true;
832 while (!Queue.empty()) {
833 auto &Block = Func.Blocks[Queue.front()];
834 Queue.pop();
835 // Process blocks reachable from Block
836 for (auto *Jump : Block.SuccJumps) {
837 // If Jump can be ignored, skip it
838 if (ignoreJump(SrcBlock, nullptr, Jump))
839 continue;
840
841 uint64_t Dst = Jump->Target;
842 // If Dst has been visited, skip Jump
843 if (Visited[Dst])
844 continue;
845 // Process block Dst
846 Visited[Dst] = true;
847 if (!Func.Blocks[Dst].HasUnknownWeight) {
848 KnownDstBlocks.push_back(&Func.Blocks[Dst]);
849 } else {
850 Queue.push(Dst);
851 UnknownBlocks.push_back(&Func.Blocks[Dst]);
852 }
853 }
854 }
855 }
856
857 /// Verify if rebalancing of the subgraph is feasible. If the checks are
858 /// successful, set the unique destination block, DstBlock (can be null).
  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
                            const std::vector<FlowBlock *> &KnownDstBlocks,
                            const std::vector<FlowBlock *> &UnknownBlocks,
                            FlowBlock *&DstBlock) {
863 // If the list of unknown blocks is empty, we don't need rebalancing
864 if (UnknownBlocks.empty())
865 return false;
866
867 // If there are multiple known sinks, we can't rebalance
868 if (KnownDstBlocks.size() > 1)
869 return false;
870 DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
871
872 // Verify sinks of the subgraph
873 for (auto *Block : UnknownBlocks) {
874 if (Block->SuccJumps.empty()) {
875 // If there are multiple (known and unknown) sinks, we can't rebalance
876 if (DstBlock != nullptr)
877 return false;
878 continue;
879 }
880 size_t NumIgnoredJumps = 0;
881 for (auto *Jump : Block->SuccJumps) {
882 if (ignoreJump(SrcBlock, DstBlock, Jump))
883 NumIgnoredJumps++;
884 }
885 // If there is a non-sink block in UnknownBlocks with all jumps ignored,
886 // then we can't rebalance
887 if (NumIgnoredJumps == Block->SuccJumps.size())
888 return false;
889 }
890
891 return true;
892 }
893
  /// Decide whether the Jump is ignored while processing an unknown subgraph
895 /// rooted at basic block SrcBlock with the destination block, DstBlock.
  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                  const FlowJump *Jump) {
898 // Ignore unlikely jumps with zero flow
899 if (Jump->IsUnlikely && Jump->Flow == 0)
900 return true;
901
902 auto JumpSource = &Func.Blocks[Jump->Source];
903 auto JumpTarget = &Func.Blocks[Jump->Target];
904
905 // Do not ignore jumps coming into DstBlock
906 if (DstBlock != nullptr && JumpTarget == DstBlock)
907 return false;
908
909 // Ignore jumps out of SrcBlock to known blocks
910 if (!JumpTarget->HasUnknownWeight && JumpSource == SrcBlock)
911 return true;
912
913 // Ignore jumps to known blocks with zero flow
914 if (!JumpTarget->HasUnknownWeight && JumpTarget->Flow == 0)
915 return true;
916
917 return false;
918 }
919
920 /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
921 /// UnknownBlocks in the topological order (so that all jumps are "forward").
  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                         std::vector<FlowBlock *> &UnknownBlocks) {
924 // Extract local in-degrees in the considered subgraph
925 auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
926 auto fillInDegree = [&](const FlowBlock *Block) {
927 for (auto *Jump : Block->SuccJumps) {
928 if (ignoreJump(SrcBlock, DstBlock, Jump))
929 continue;
930 LocalInDegree[Jump->Target]++;
931 }
932 };
933 fillInDegree(SrcBlock);
934 for (auto *Block : UnknownBlocks) {
935 fillInDegree(Block);
936 }
937 // A loop containing SrcBlock
938 if (LocalInDegree[SrcBlock->Index] > 0)
939 return false;
940
941 std::vector<FlowBlock *> AcyclicOrder;
942 std::queue<uint64_t> Queue;
943 Queue.push(SrcBlock->Index);
944 while (!Queue.empty()) {
945 FlowBlock *Block = &Func.Blocks[Queue.front()];
946 Queue.pop();
947 // Stop propagation once we reach DstBlock, if any
948 if (DstBlock != nullptr && Block == DstBlock)
949 break;
950
951 // Keep an acyclic order of unknown blocks
952 if (Block->HasUnknownWeight && Block != SrcBlock)
953 AcyclicOrder.push_back(Block);
954
955 // Add to the queue all successors with zero local in-degree
956 for (auto *Jump : Block->SuccJumps) {
957 if (ignoreJump(SrcBlock, DstBlock, Jump))
958 continue;
959 uint64_t Dst = Jump->Target;
960 LocalInDegree[Dst]--;
961 if (LocalInDegree[Dst] == 0) {
962 Queue.push(Dst);
963 }
964 }
965 }
966
967 // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
968 // of all blocks
969 if (UnknownBlocks.size() != AcyclicOrder.size())
970 return false;
971 UnknownBlocks = AcyclicOrder;
972 return true;
973 }
974
975 /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
976 /// having UnknownBlocks intermediate blocks.
  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
                                const FlowBlock *DstBlock,
                                const std::vector<FlowBlock *> &UnknownBlocks) {
980 assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
981
    // Distribute flow from the source block
983 uint64_t BlockFlow = 0;
984 // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
985 for (auto *Jump : SrcBlock->SuccJumps) {
986 if (ignoreJump(SrcBlock, DstBlock, Jump))
987 continue;
988 BlockFlow += Jump->Flow;
989 }
990 rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
991
    // Distribute flow from the remaining blocks
993 for (auto *Block : UnknownBlocks) {
994 assert(Block->HasUnknownWeight && "incorrect unknown subgraph");
995 uint64_t BlockFlow = 0;
996 // Block's flow is the sum of incoming flows
997 for (auto *Jump : Block->PredJumps) {
998 BlockFlow += Jump->Flow;
999 }
1000 Block->Flow = BlockFlow;
1001 rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
1002 }
1003 }
1004
1005 /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
1006 /// and ending at DstBlock.
  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                      const FlowBlock *Block, uint64_t BlockFlow) {
1009 // Process all successor jumps and update corresponding flow values
1010 size_t BlockDegree = 0;
1011 for (auto *Jump : Block->SuccJumps) {
1012 if (ignoreJump(SrcBlock, DstBlock, Jump))
1013 continue;
1014 BlockDegree++;
1015 }
1016 // If all successor jumps of the block are ignored, skip it
1017 if (DstBlock == nullptr && BlockDegree == 0)
1018 return;
1019 assert(BlockDegree > 0 && "all outgoing jumps are ignored");
1020
1021 // Each of the Block's successors gets the following amount of flow.
1022 // Rounding the value up so that all flow is propagated
1023 uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
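    // For illustration: BlockFlow = 5 with BlockDegree = 2 gives SuccFlow = 3,
    // so the two non-ignored jumps receive 3 and 2 units, respectively.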
1024 for (auto *Jump : Block->SuccJumps) {
1025 if (ignoreJump(SrcBlock, DstBlock, Jump))
1026 continue;
1027 uint64_t Flow = std::min(SuccFlow, BlockFlow);
1028 Jump->Flow = Flow;
1029 BlockFlow -= Flow;
1030 }
1031 assert(BlockFlow == 0 && "not all flow is propagated");
1032 }
1033
1034 /// A constant indicating an arbitrary exit block of a function.
1035 static constexpr uint64_t AnyExitBlock = uint64_t(-1);
1036 /// Minimum BaseDistance for the jump distance values in island joining.
1037 static constexpr uint64_t MinBaseDistance = 10000;
1038
1039 /// Params for flow computation.
1040 const ProfiParams &Params;
1041 /// The function.
1042 FlowFunction &Func;
1043 };
1044
1045 std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
1046 const FlowBlock &Block);
1047 std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
1048 const FlowJump &Jump);
1049
1050 /// Initializing flow network for a given function.
1051 ///
1052 /// Every block is split into two nodes that are responsible for (i) an
1053 /// incoming flow, (ii) an outgoing flow; they penalize an increase or a
1054 /// reduction of the block weight.
void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network,
                       FlowFunction &Func) {
1057 uint64_t NumBlocks = Func.Blocks.size();
1058 assert(NumBlocks > 1 && "Too few blocks in a function");
1059 uint64_t NumJumps = Func.Jumps.size();
1060 assert(NumJumps > 0 && "Too few jumps in a function");
1061
1062 // Introducing dummy source/sink pairs to allow flow circulation.
  // The nodes corresponding to blocks of the function have indices in
1064 // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the
1065 // next four values.
1066 uint64_t S = 2 * NumBlocks;
1067 uint64_t T = S + 1;
1068 uint64_t S1 = S + 2;
1069 uint64_t T1 = S + 3;
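  // For example, a function with 3 blocks uses node indices 0..5 for the
  // blocks (Bin = 2 * B, Bout = 2 * B + 1) and S = 6, T = 7, S1 = 8, T1 = 9
  // for the dummy nodes.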
1070
1071 Network.initialize(2 * NumBlocks + 4, S1, T1);
1072
1073 // Initialize nodes of the flow network
1074 for (uint64_t B = 0; B < NumBlocks; B++) {
1075 auto &Block = Func.Blocks[B];
1076
1077 // Split every block into two auxiliary nodes to allow
1078 // increase/reduction of the block count.
1079 uint64_t Bin = 2 * B;
1080 uint64_t Bout = 2 * B + 1;
1081
1082 // Edges from S and to T
1083 if (Block.isEntry()) {
1084 Network.addEdge(S, Bin, 0);
1085 } else if (Block.isExit()) {
1086 Network.addEdge(Bout, T, 0);
1087 }
1088
1089 // Assign costs for increasing/decreasing the block counts
1090 auto [AuxCostInc, AuxCostDec] = assignBlockCosts(Params, Block);
1091
1092 // Add the corresponding edges to the network
1093 Network.addEdge(Bin, Bout, AuxCostInc);
1094 if (Block.Weight > 0) {
1095 Network.addEdge(Bout, Bin, Block.Weight, AuxCostDec);
1096 Network.addEdge(S1, Bout, Block.Weight, 0);
1097 Network.addEdge(Bin, T1, Block.Weight, 0);
1098 }
1099 }
1100
1101 // Initialize edges of the flow network
1102 for (uint64_t J = 0; J < NumJumps; J++) {
1103 auto &Jump = Func.Jumps[J];
1104
1105 // Get the endpoints corresponding to the jump
1106 uint64_t Jin = 2 * Jump.Source + 1;
1107 uint64_t Jout = 2 * Jump.Target;
1108
1109 // Assign costs for increasing/decreasing the jump counts
1110 auto [AuxCostInc, AuxCostDec] = assignJumpCosts(Params, Jump);
1111
1112 // Add the corresponding edges to the network
1113 Network.addEdge(Jin, Jout, AuxCostInc);
1114 if (Jump.Weight > 0) {
1115 Network.addEdge(Jout, Jin, Jump.Weight, AuxCostDec);
1116 Network.addEdge(S1, Jout, Jump.Weight, 0);
1117 Network.addEdge(Jin, T1, Jump.Weight, 0);
1118 }
1119 }
1120
1121 // Make sure we have a valid flow circulation
1122 Network.addEdge(T, S, 0);
1123 }
1124
1125 /// Assign costs for increasing/decreasing the block counts.
std::pair<int64_t, int64_t> assignBlockCosts(const ProfiParams &Params,
                                             const FlowBlock &Block) {
1128 // Modifying the weight of an unlikely block is expensive
1129 if (Block.IsUnlikely)
1130 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1131
1132 // Assign default values for the costs
1133 int64_t CostInc = Params.CostBlockInc;
1134 int64_t CostDec = Params.CostBlockDec;
1135 // Update the costs depending on the block metadata
1136 if (Block.HasUnknownWeight) {
1137 CostInc = Params.CostBlockUnknownInc;
1138 CostDec = 0;
1139 } else {
1140 // Increasing the count for "cold" blocks with zero initial count is more
1141 // expensive than for "hot" ones
1142 if (Block.Weight == 0)
1143 CostInc = Params.CostBlockZeroInc;
1144 // Modifying the count of the entry block is expensive
1145 if (Block.isEntry()) {
1146 CostInc = Params.CostBlockEntryInc;
1147 CostDec = Params.CostBlockEntryDec;
1148 }
1149 }
1150 return std::make_pair(CostInc, CostDec);
1151 }
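// For illustration, with the default flag values declared above: an
// unknown-weight block gets (CostBlockUnknownInc, 0) = (0, 0); a known
// zero-weight non-entry block gets (CostBlockZeroInc, CostBlockDec) =
// (11, 20); a known entry block gets (CostBlockEntryInc, CostBlockEntryDec) =
// (40, 10); and an unlikely block gets (CostUnlikely, CostUnlikely).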
1152
1153 /// Assign costs for increasing/decreasing the jump counts.
std::pair<int64_t, int64_t> assignJumpCosts(const ProfiParams &Params,
                                            const FlowJump &Jump) {
1156 // Modifying the weight of an unlikely jump is expensive
1157 if (Jump.IsUnlikely)
1158 return std::make_pair(Params.CostUnlikely, Params.CostUnlikely);
1159
1160 // Assign default values for the costs
1161 int64_t CostInc = Params.CostJumpInc;
1162 int64_t CostDec = Params.CostJumpDec;
1163 // Update the costs depending on the block metadata
1164 if (Jump.Source + 1 == Jump.Target) {
1165 // Adjusting the fall-through branch
1166 CostInc = Params.CostJumpFTInc;
1167 CostDec = Params.CostJumpFTDec;
1168 }
1169 if (Jump.HasUnknownWeight) {
1170 // The cost is different for fall-through and non-fall-through branches
1171 if (Jump.Source + 1 == Jump.Target)
1172 CostInc = Params.CostJumpUnknownFTInc;
1173 else
1174 CostInc = Params.CostJumpUnknownInc;
1175 CostDec = 0;
1176 } else {
    assert(Jump.Weight > 0 && "found a known-weight jump with zero weight");
1178 }
1179 return std::make_pair(CostInc, CostDec);
1180 }
1181
1182 /// Extract resulting block and edge counts from the flow network.
void extractWeights(const ProfiParams &Params, MinCostMaxFlow &Network,
                    FlowFunction &Func) {
1185 uint64_t NumBlocks = Func.Blocks.size();
1186 uint64_t NumJumps = Func.Jumps.size();
1187
1188 // Extract resulting jump counts
1189 for (uint64_t J = 0; J < NumJumps; J++) {
1190 auto &Jump = Func.Jumps[J];
1191 uint64_t SrcOut = 2 * Jump.Source + 1;
1192 uint64_t DstIn = 2 * Jump.Target;
1193
1194 int64_t Flow = 0;
1195 int64_t AuxFlow = Network.getFlow(SrcOut, DstIn);
1196 if (Jump.Source != Jump.Target)
1197 Flow = int64_t(Jump.Weight) + AuxFlow;
1198 else
1199 Flow = int64_t(Jump.Weight) + (AuxFlow > 0 ? AuxFlow : 0);
1200
1201 Jump.Flow = Flow;
1202 assert(Flow >= 0 && "negative jump flow");
1203 }
1204
1205 // Extract resulting block counts
1206 auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1207 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1208 for (auto &Jump : Func.Jumps) {
1209 InFlow[Jump.Target] += Jump.Flow;
1210 OutFlow[Jump.Source] += Jump.Flow;
1211 }
1212 for (uint64_t B = 0; B < NumBlocks; B++) {
1213 auto &Block = Func.Blocks[B];
1214 Block.Flow = std::max(OutFlow[B], InFlow[B]);
1215 }
1216 }
1217
1218 #ifndef NDEBUG
1219 /// Verify that the provided block/jump weights are as expected.
void verifyInput(const FlowFunction &Func) {
1221 // Verify the entry block
1222 assert(Func.Entry == 0 && Func.Blocks[0].isEntry());
1223 for (size_t I = 1; I < Func.Blocks.size(); I++) {
1224 assert(!Func.Blocks[I].isEntry() && "multiple entry blocks");
1225 }
1226 // Verify CFG jumps
1227 for (auto &Block : Func.Blocks) {
1228 assert((!Block.isEntry() || !Block.isExit()) &&
1229 "a block cannot be an entry and an exit");
1230 }
1231 // Verify input block weights
1232 for (auto &Block : Func.Blocks) {
1233 assert((!Block.HasUnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
1234 "non-zero weight of a block w/o weight except for an entry");
1235 }
1236 // Verify input jump weights
1237 for (auto &Jump : Func.Jumps) {
1238 assert((!Jump.HasUnknownWeight || Jump.Weight == 0) &&
1239 "non-zero weight of a jump w/o weight");
1240 }
1241 }
1242
1243 /// Verify that the computed flow values satisfy flow conservation rules.
void verifyOutput(const FlowFunction &Func) {
1245 const uint64_t NumBlocks = Func.Blocks.size();
1246 auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
1247 auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
1248 for (const auto &Jump : Func.Jumps) {
1249 InFlow[Jump.Target] += Jump.Flow;
1250 OutFlow[Jump.Source] += Jump.Flow;
1251 }
1252
1253 uint64_t TotalInFlow = 0;
1254 uint64_t TotalOutFlow = 0;
1255 for (uint64_t I = 0; I < NumBlocks; I++) {
1256 auto &Block = Func.Blocks[I];
1257 if (Block.isEntry()) {
1258 TotalInFlow += Block.Flow;
1259 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
1260 } else if (Block.isExit()) {
1261 TotalOutFlow += Block.Flow;
1262 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
1263 } else {
1264 assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
1265 assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
1266 }
1267 }
1268 assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
1269
1270 // Verify that there are no isolated flow components
1271 // One could modify FlowFunction to hold edges indexed by the sources, which
1272 // will avoid a creation of the object
1273 auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
1274 for (const auto &Jump : Func.Jumps) {
1275 if (Jump.Flow > 0) {
1276 PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
1277 }
1278 }
1279
1280 // Run BFS from the source along edges with positive flow
1281 std::queue<uint64_t> Queue;
1282 auto Visited = BitVector(NumBlocks, false);
1283 Queue.push(Func.Entry);
1284 Visited[Func.Entry] = true;
1285 while (!Queue.empty()) {
1286 uint64_t Src = Queue.front();
1287 Queue.pop();
1288 for (uint64_t Dst : PositiveFlowEdges[Src]) {
1289 if (!Visited[Dst]) {
1290 Queue.push(Dst);
1291 Visited[Dst] = true;
1292 }
1293 }
1294 }
1295
1296 // Verify that every block that has a positive flow is reached from the source
1297 // along edges with a positive flow
1298 for (uint64_t I = 0; I < NumBlocks; I++) {
1299 auto &Block = Func.Blocks[I];
1300 assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
1301 }
1302 }
1303 #endif
1304
1305 } // end of anonymous namespace
1306
1307 /// Apply the profile inference algorithm for a given function
void llvm::applyFlowInference(const ProfiParams &Params, FlowFunction &Func) {
1309 #ifndef NDEBUG
1310 // Verify the input data
1311 verifyInput(Func);
1312 #endif
1313
1314 // Create and apply an inference network model
1315 auto InferenceNetwork = MinCostMaxFlow(Params);
1316 initializeNetwork(Params, InferenceNetwork, Func);
1317 InferenceNetwork.run();
1318
1319 // Extract flow values for every block and every edge
1320 extractWeights(Params, InferenceNetwork, Func);
1321
1322 // Post-processing adjustments to the flow
1323 auto Adjuster = FlowAdjuster(Params, Func);
1324 Adjuster.run();
1325
1326 #ifndef NDEBUG
1327 // Verify the result
1328 verifyOutput(Func);
1329 #endif
1330 }
1331
1332 /// Apply the profile inference algorithm for a given flow function
void llvm::applyFlowInference(FlowFunction &Func) {
1334 ProfiParams Params;
1335 // Set the params from the command-line flags.
1336 Params.EvenFlowDistribution = SampleProfileEvenFlowDistribution;
1337 Params.RebalanceUnknown = SampleProfileRebalanceUnknown;
1338 Params.JoinIslands = SampleProfileJoinIslands;
1339 Params.CostBlockInc = SampleProfileProfiCostBlockInc;
1340 Params.CostBlockDec = SampleProfileProfiCostBlockDec;
1341 Params.CostBlockEntryInc = SampleProfileProfiCostBlockEntryInc;
1342 Params.CostBlockEntryDec = SampleProfileProfiCostBlockEntryDec;
1343 Params.CostBlockZeroInc = SampleProfileProfiCostBlockZeroInc;
1344 Params.CostBlockUnknownInc = SampleProfileProfiCostBlockUnknownInc;
1345
1346 applyFlowInference(Params, Func);
1347 }
1348