PaStiX Handbook  6.3.2
propmap.c
Go to the documentation of this file.
1 /**
2  *
3  * @file propmap.c
4  *
5  * PaStiX analyse proportionnal mapping functions.
6  *
7  * @copyright 2004-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.3.2
11  * @author Pascal Henon
12  * @author Mathieu Faverge
13  * @author Gregoire Pichon
14  * @author Xavier Lacoste
15  * @date 2023-07-21
16  *
17  * @addtogroup blend_dev_propmap
18  * @{
19  *
20  **/
21 #include "common.h"
22 #include "symbol/symbol.h"
23 #include "blend/elimintree.h"
24 #include "blend/cost.h"
25 #include "blend/cand.h"
26 #include "blend/extendVector.h"
27 #include "blend/blendctrl.h"
28 #include "queue.h"
29 
30 /**
31  * @brief Minimal work ratio to accept the contribution from one additional candidate
32  */
33 #define CROSS_TOLERANCE 0.1
34 
35 /**
36  * @brief Proportional mapping structure to forward the arguments throught the
37  * recursive calls.
38  */
39 typedef struct propmap_s {
40  const EliminTree *etree; /**< Elimination tree to map */
41  Cand *candtab; /**< Candidate array for each node of the etree */
42  pastix_int_t candnbr; /**< Number of candidates available */
43  int nocrossproc; /**< Enable/disable the distribution of one candidate on multiple branches */
45 
46 /**
47  *******************************************************************************
48  *
49  * @brief Set the given candidates to all the subtree w/o conditions.
50  *
51  *******************************************************************************
52  *
53  * @param[in] pmptr
54  * Pointer to the parameters of the proportional mapping algorithm.
55  *
56  * @param[in] rootnum
57  * Index of the root of the subtree to be given to [fcandnum,lcandnum]
58  *
59  * @param[in] fcandnum
60  * Rank of the first candidate attributed to this subtree.
61  *
62  * @param[in] lcandnum
63  * Rank of the last candidate attributed to this subtree.
64  *
65  * @param[in] cluster
66  * The cluster value for the subtree.
67  *
68  *******************************************************************************/
69 static inline void
71  pastix_int_t rootnum,
72  pastix_int_t fcandnum,
73  pastix_int_t lcandnum,
74  pastix_int_t cluster )
75 {
76  pastix_int_t i;
77  pastix_int_t sonsnbr;
78 
79  pmptr->candtab[rootnum].fcandnum = fcandnum;
80  pmptr->candtab[rootnum].lcandnum = lcandnum;
81  pmptr->candtab[rootnum].cluster = cluster;
82 
83  /* Recursively apply the affectation to the sons */
84  sonsnbr = pmptr->etree->nodetab[rootnum].sonsnbr;
85  for(i=0;i<sonsnbr;i++) {
86  propMappSubtreeOn1P( pmptr, eTreeSonI(pmptr->etree, rootnum, i),
87  fcandnum, lcandnum, cluster );
88  }
89 
90  return;
91 }
92 
93 /**
94  *******************************************************************************
95  *
96  * @brief Apply the proportional mapping algorithm to a subtree.
97  *
98  *******************************************************************************
99  *
100  * @param[in] pmptr
101  * Pointer to the parameters of the proportional mapping algorithm.
102  *
103  * @param[in] rootnum
104  * Index of the root of the subtree to process.
105  *
106  * @param[in] fcandnum
107  * Rank of the first candidate attributed to this subtree.
108  *
109  * @param[in] lcandnum
110  * Rank of the last candidate attributed to this subtree.
111  *
112  * @param[in] cluster
113  * The cluster value for the subtree.
114  *
115  * @param[inout] cost_remain
116  * Array of size (lcandnum-fcandnum+1).
117  * Stores the remaining idle time of each candidate to balance the load
118  * among them. On exit, the cost is updated with the affected subtrees.
119  *
120  *******************************************************************************/
121 static inline void
123  pastix_int_t rootnum,
124  pastix_int_t fcandnum,
125  pastix_int_t lcandnum,
126  pastix_int_t cluster,
127  double *cost_remain )
128 {
129  pastix_queue_t *queue_tree;
130  pastix_int_t p;
131  pastix_int_t candnbr, scandnbr;
132  pastix_int_t fcand = 0;
133  pastix_int_t lcand = 0;
134  pastix_int_t i;
135  pastix_int_t sonsnbr;
136  double isocost;
137  double aspt_cost;
138  double cumul_cost;
139  double *sub_cost_remain = NULL;
140  double epsilon;
141 
142  candnbr = lcandnum - fcandnum + 1;
143 
144  /* If only one candidate, all the subtree belongs to it */
145  if (candnbr == 1)
146  {
147  propMappSubtreeOn1P( pmptr, rootnum, fcandnum, lcandnum, cluster );
148  return;
149  }
150 
151  /* Set the cand group for this tree node */
152  pmptr->candtab[rootnum].fcandnum = fcandnum;
153  pmptr->candtab[rootnum].lcandnum = lcandnum;
154  pmptr->candtab[rootnum].cluster = cluster;
155 
156  /* This treenode is a leave, return */
157  if(pmptr->etree->nodetab[rootnum].sonsnbr == 0)
158  {
159  return;
160  }
161 
162  assert( pmptr->etree->nodetab[rootnum].ndecost <= pmptr->etree->nodetab[rootnum].subcost );
163 
164  /* Work that each processor is intended to get from this treenode */
165  isocost = pmptr->etree->nodetab[rootnum].ndecost / candnbr;
166  for(p=0;p<candnbr;p++)
167  cost_remain[p] -= isocost;
168 
169  /* Get cost remaining in the descendance of the treenode */
170  aspt_cost = pmptr->etree->nodetab[rootnum].subcost - pmptr->etree->nodetab[rootnum].ndecost;
171 
172  /*
173  * If the first and last candidate have already received more work that they
174  * should, we remove them from the set
175  */
176  if(cost_remain[0] <= 0)
177  fcand = 1;
178  else
179  fcand = 0;
180  if (cost_remain[candnbr-1] <= 0)
181  candnbr--;
182 
183  assert(fcand < candnbr);
184  assert(candnbr >= 1);
185 
186  /* Make sure that the sum of cost_remain in used proc is at least equals to aspt_cost */
187  cumul_cost = 0;
188  for(i=fcand; i<candnbr; i++)
189  cumul_cost += cost_remain[i];
190 
191  if (cumul_cost < aspt_cost) {
192  double ratio = aspt_cost / cumul_cost;
193  for(i=fcand; i<candnbr; i++)
194  cost_remain[i] *= ratio;
195  }
196 
197  /* Compute the minimun participation rate of a candidat processor */
198  epsilon = (CROSS_TOLERANCE * cumul_cost) / ((double)candnbr);
199 
200  /*
201  * Compute the cand group for each proc
202  */
203  sonsnbr = pmptr->etree->nodetab[rootnum].sonsnbr;
204 
205  /* Create the list of sons sorted by descending order of cost */
206  MALLOC_INTERN(queue_tree, 1, pastix_queue_t);
207  pqueueInit(queue_tree, sonsnbr);
208  double soncost;
209  for(i=0; i<sonsnbr; i++)
210  {
211  /* Cost in the current subtree to be mapped */
212  cumul_cost = -pmptr->etree->nodetab[eTreeSonI(pmptr->etree, rootnum, i)].subcost;
213 
214  /* Cost of the root node in the subtree */
215  soncost = -pmptr->etree->nodetab[eTreeSonI(pmptr->etree, rootnum, i)].ndecost;
216 
217  pqueuePush2(queue_tree, i, cumul_cost, (pastix_int_t)soncost);
218  }
219 
220  /* Proportionnal mapping of the subtree on remaining candidates */
221  /* The first stage deals only with nodes that require multiple candidates */
222  while (pqueueSize(queue_tree) > 0)
223  {
224  i = pqueuePop2( queue_tree, &cumul_cost, NULL );
225  cumul_cost = -cumul_cost;
226 
227  /*
228  * If the subtree cost is smaller than what the first candidate can
229  * process, following nodes will be smaller and following candidates have
230  * at least the same amount of ressources available so, we skip to the
231  * second stage
232  */
233  if (cumul_cost <= cost_remain[fcand]) {
234  cost_remain[fcand] -= cumul_cost;
235  propMappSubtreeOn1P( pmptr, eTreeSonI(pmptr->etree, rootnum, i),
236  fcandnum+fcand, fcandnum+fcand, cluster);
237  break;
238  }
239 
240  /*
241  * If a candidate cannot participate to multiple subtrees, we check with
242  * epsilon to avoid overflow
243  */
244  if ( pmptr->nocrossproc &&
245  (cumul_cost <= (cost_remain[fcand]+epsilon)) )
246  {
247  cost_remain[fcand] -= cumul_cost;
248  propMappSubtreeOn1P( pmptr, eTreeSonI(pmptr->etree, rootnum, i),
249  fcandnum+fcand, fcandnum+fcand, cluster );
250  break;
251  }
252 
253  /* If the first candidate doesn't have enough ressources, we skip it */
254  if( (fcand < candnbr-1) && (cost_remain[fcand] <= epsilon) )
255  {
256  fcand++;
257  }
258 
259  /*
260  * Computes how many candidate will participate to this node. We add
261  * candidate as long as we have some and they all have more than espilon
262  * extra work.
263  */
264  lcand = fcand;
265  scandnbr = 1;
266  cumul_cost -= cost_remain[fcand];
267  while ((( pmptr->nocrossproc && (cumul_cost > ((double)scandnbr)*epsilon)) ||
268  (!pmptr->nocrossproc && (cumul_cost > epsilon))) &&
269  (lcand < candnbr - 1))
270  {
271  lcand++; scandnbr++;
272  assert( cost_remain[lcand] > 0 );
273  cumul_cost -= cost_remain[lcand];
274  }
275 
276  /*
277  * Prepare the sub_cost_remain array.
278  * If some cost is remaining, we distribute equally the work to each
279  * candidate, otherwise we give the cost remaining to the last candidate
280  * and set to zero the remaining cost of the first candidates.
281  */
282  MALLOC_INTERN(sub_cost_remain, lcand-fcand+1, double);
283  if (cumul_cost > 0)
284  {
285  isocost = cumul_cost / scandnbr;
286  for(p=0; p<scandnbr; p++)
287  {
288  sub_cost_remain[p] = cost_remain[fcand+p] + isocost;
289  cost_remain[fcand+p] = - isocost;
290  }
291  }
292  else
293  {
294  for(p=0; p<scandnbr-1; p++)
295  {
296  sub_cost_remain[p] = cost_remain[fcand+p];
297  cost_remain[fcand+p] = 0.0;
298  }
299  sub_cost_remain[scandnbr-1] = cost_remain[lcand] + cumul_cost;
300  cost_remain[fcand+scandnbr-1] = -cumul_cost; /* cumul_cost <= 0 */
301  }
302  /* Go on to subtree */
303  propMappSubtree( pmptr, eTreeSonI(pmptr->etree, rootnum, i),
304  fcandnum+fcand, fcandnum+lcand, cluster, sub_cost_remain);
305  memFree_null(sub_cost_remain);
306 
307  if ( (lcand < candnbr - 1) &&
308  ( pmptr->nocrossproc ||
309  (cost_remain[lcand] < epsilon) ) )
310  {
311  fcand = lcand+1;
312  }
313  else
314  {
315  if ( pmptr->nocrossproc )
316  break;
317  fcand = lcand;
318  }
319  }
320 
321  if (pqueueSize(queue_tree) > 0)
322  {
323  pastix_queue_t *queue_proc;
324 
325  /*
326  * Second stage:
327  * Distribute the single candidate subtree. At each step of the algorithm we
328  * associate together the candidate with maximum cost_remain and the largest
329  * son.
330  */
331 
332  /* Fill queue proc order by remain cost descending */
333  MALLOC_INTERN(queue_proc, 1, pastix_queue_t);
334  pqueueInit(queue_proc, candnbr);
335  for (i=0; i<candnbr; i++) {
336  pqueuePush1(queue_proc, i, -cost_remain[i]);
337  }
338 
339  while (pqueueSize(queue_tree) > 0)
340  {
341  /* Get the largest node */
342  i = pqueuePop2( queue_tree, &cumul_cost, NULL );
343  cumul_cost = -cumul_cost;
344 
345  /* Get the candidate with the largest cost_remain */
346  fcand = pqueuePop(queue_proc);
347 
348  /* Map them together */
349  propMappSubtreeOn1P( pmptr, eTreeSonI(pmptr->etree, rootnum, i),
350  fcandnum+fcand, fcandnum+fcand, cluster);
351 
352  /* Update cost_remain and re-insert into the sorted queue */
353  cost_remain[fcand] -= cumul_cost;
354  pqueuePush1(queue_proc, fcand, -cost_remain[fcand]);
355  }
356 
357  pqueueExit(queue_proc);
358  memFree(queue_proc);
359  }
360 
361  pqueueExit(queue_tree);
362  memFree(queue_tree);
363  return;
364 }
365 
366 /**
367  * @}
368  */
369 
370 /**
371  *******************************************************************************
372  *
373  * @ingroup pastix_blend
374  *
375  * @brief Apply the proportional mapping algorithm.
376  *
377  * This function computes the proportionnal mapping of the elimination tree. The
378  * result is a set of potential candidates to compute each node of the
379  * elimination tree. The real candidate will be affected during the simulation
380  * with simuRun(). It is then important to reduce as much as possible the number
381  * of candidates per node, while keeping enough freedom for the scheduling to
382  * allow a good load balance and few idle times in the final static decision.
383  *
384  *******************************************************************************
385  *
386  * @param[inout] candtab
387  * On entry, the candtab array must contain the cost of each node of
388  * the elimination tree, and their depth in the tree as computed by
389  * candBuild().
390  * On exit, the fields fcandnum and lcandnum are computed with the
391  * proportional mapping algorithm that tries to balance the load
392  * between the candidates and distribute the branches to everyone
393  * according to their cost.
394  *
395  * @param[in] etree
396  * The elimination tree to map on the ressources.
397  *
398  * @param[in] candnbr
399  * The total number of candidates to distribute over the elimination
400  * tree.
401  *
402  * @param[in] nocrossproc
403  * If nocrossproc is enabled, candidates can NOT be part of two
404  * subranches with different co-workers in each branch.
405  * If nocrossproc is disabled, candidate can be shared between two
406  * subranches if the amount of extra work exceeds 10%.
407  *
408  * @param[in] allcand
409  * No proportional mapping is performed and everyone is candidate to
410  * everything. This will have a large performance impact on the
411  * simulation.
412  *
413  *******************************************************************************/
414 void
415 propMappTree( Cand *candtab,
416  const EliminTree *etree,
417  pastix_int_t candnbr,
418  int nocrossproc, int allcand )
419 {
420  propmap_t pmdata;
421  pastix_int_t p;
422 
423  /* Prepare the stucture */
424  pmdata.candtab = candtab;
425  pmdata.etree = etree;
426  pmdata.candnbr = candnbr;
427  pmdata.nocrossproc = nocrossproc;
428 
429  if ( allcand ) {
430  propMappSubtreeOn1P( &pmdata, eTreeRoot(etree),
431  0, candnbr-1, 0 );
432  }
433  else {
434  double *cost_remain = NULL;
435  double isocost;
436 
437  /* Prepare the initial cost_remain array */
438  MALLOC_INTERN(cost_remain, candnbr, double);
439  isocost = etree->nodetab[ eTreeRoot(etree) ].subcost / candnbr;
440 
441  for(p=0; p<candnbr; p++) {
442  cost_remain[p] = isocost;
443  }
444 
445  propMappSubtree( &pmdata, eTreeRoot(etree),
446  0, candnbr-1,
447  0, cost_remain );
448 
449  memFree_null(cost_remain);
450  }
451 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
Processor candidate group to own a column blok.
Definition: cand.h:28
double subcost
Definition: elimintree.h:28
eTreeNode_t * nodetab
Definition: elimintree.h:42
static pastix_int_t eTreeRoot(const EliminTree *etree)
Return the root of the elimination tree.
Definition: elimintree.h:126
static pastix_int_t eTreeSonI(const EliminTree *etree, pastix_int_t node, pastix_int_t i)
Return the i^{th} son of a given node.
Definition: elimintree.h:105
Elimination tree.
Definition: elimintree.h:39
static void propMappSubtreeOn1P(const propmap_t *pmptr, pastix_int_t rootnum, pastix_int_t fcandnum, pastix_int_t lcandnum, pastix_int_t cluster)
Set the given candidates to all the subtree w/o conditions.
Definition: propmap.c:70
#define CROSS_TOLERANCE
Minimal work ratio to accept the contribution from one additional candidate.
Definition: propmap.c:33
static void propMappSubtree(const propmap_t *pmptr, pastix_int_t rootnum, pastix_int_t fcandnum, pastix_int_t lcandnum, pastix_int_t cluster, double *cost_remain)
Apply the proportional mapping algorithm to a subtree.
Definition: propmap.c:122
struct propmap_s propmap_t
Proportional mapping structure to forward the arguments throught the recursive calls.
void pqueuePush2(pastix_queue_t *, pastix_int_t, double, double)
Insert an element into the sorted queue.
Definition: queue.c:178
static void pqueuePush1(pastix_queue_t *q, pastix_int_t elt, double key1)
Push an element with a single key.
Definition: queue.h:64
void pqueueExit(pastix_queue_t *)
Free the structure associated to the queue.
Definition: queue.c:110
pastix_int_t pqueuePop2(pastix_queue_t *, double *, double *)
Remove the first element of the queue and return its keys if needed.
Definition: queue.c:268
pastix_int_t pqueueSize(const pastix_queue_t *)
Return the size of the queue.
Definition: queue.c:135
static pastix_int_t pqueuePop(pastix_queue_t *q)
Pop the head of the queue whithout returning the keys.
Definition: queue.h:75
int pqueueInit(pastix_queue_t *, pastix_int_t)
Initialize the queue structure with an initial space to store the elements.
Definition: queue.c:81
Queue structure.
Definition: queue.h:38
void propMappTree(Cand *candtab, const EliminTree *etree, pastix_int_t candnbr, int nocrossproc, int allcand)
Apply the proportional mapping algorithm.
Definition: propmap.c:415