PIPS
freia_terapix.c
Go to the documentation of this file.
1 /*
2 
3  $Id: freia_terapix.c 23495 2018-10-24 09:19:47Z coelho $
4 
5  Copyright 1989-2016 MINES ParisTech
6 
7  This file is part of PIPS.
8 
9  PIPS is free software: you can redistribute it and/or modify it
10  under the terms of the GNU General Public License as published by
11  the Free Software Foundation, either version 3 of the License, or
12  any later version.
13 
14  PIPS is distributed in the hope that it will be useful, but WITHOUT ANY
15  WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  FITNESS FOR A PARTICULAR PURPOSE.
17 
18  See the GNU General Public License for more details.
19 
20  You should have received a copy of the GNU General Public License
21  along with PIPS. If not, see <http://www.gnu.org/licenses/>.
22 
23 */
24 
25 #ifdef HAVE_CONFIG_H
26 #include "pips_config.h"
27 #endif
28 
29 #include <stdint.h>
30 #include <stdlib.h>
31 
32 #include "genC.h"
33 #include "misc.h"
34 
35 #include "linear.h"
36 
37 #include "ri.h"
38 #include "effects.h"
39 #include "ri-util.h"
40 #include "effects-util.h"
41 #include "properties.h"
42 
43 #include "freia.h"
44 #include "freia_spoc_private.h"
45 #include "hwac.h"
46 
47 /********************************************************** TERAPIX ANALYSES */
48 
49 /* @return the dead vertices (their output is dead) after computing v in d.
50  * ??? should it take care that an output node is never dead?
51  */
53  (set deads, const set computed, const dag d, const dagvtx v)
54 {
55  list preds = dag_vertex_preds(d, v);
56  set futured_computed = set_dup(computed);
57  set_add_element(futured_computed, futured_computed, v);
58  FOREACH(dagvtx, p, preds)
59  if (// !gen_in_list_p(p, dag_outputs(d)) &&
60  list_in_set_p(dagvtx_succs(p), futured_computed))
61  set_add_element(deads, deads, p);
62  gen_free_list(preds);
63  set_free(futured_computed);
64 }
65 
66 /* tell whether the kernel is used on each of the 4 directions.
67  */
69  (dagvtx v, bool * north, bool * south, bool * west, bool * east)
70 {
71  // default result
72  *north = true, *south = true, *west = true, *east = true;
73  intptr_t k00, k10, k20, k01, k11, k21, k02, k12, k22;
74  freia_extract_kernel_vtx(v, false,
75  &k00, &k10, &k20, &k01, &k11, &k21, &k02, &k12, &k22);
76  // summarize for each four directions
77  *north = k00 || k10 || k20;
78  *south = k02 || k12 || k22;
79  *west = k00 || k01 || k02;
80  *east = k20 || k21 || k22;
81 }
82 
// Trick to keep a single pointer-keyed hash table for the 4 directions:
// the key of a direction is the vertex address plus a small offset.
// This relies on the vertex address alignment, so that +0 to +3 are
// distinct values which cannot clash with the keys of another vertex.
// The argument is parenthesized so that the cast applies to the whole
// expression, whatever is passed to the macro.
#define NORTH(v) ((void*) (((_int)(v))+0))
#define SOUTH(v) ((void*) (((_int)(v))+1))
#define WEST(v) ((void*) (((_int)(v))+2))
#define EAST(v) ((void*) (((_int)(v))+3))
90 
91 /* update_erosions().
92  * compute and store the imagelet erosion on vertex v output.
93  */
94 static void update_erosions
95  (const dag d, const dagvtx v, hash_table erosion)
96 {
97  _int n = 0, s = 0, w = 0, e = 0;
98 
99  // compute most eroded imagelet
100  const list preds = dag_vertex_preds(d, v);
101  FOREACH(dagvtx, p, preds)
102  {
103  if ((_int)hash_get(erosion, NORTH(p))>n)
104  n = (_int) hash_get(erosion, NORTH(p));
105  if ((_int)hash_get(erosion, SOUTH(p))>s)
106  s = (_int) hash_get(erosion, SOUTH(p));
107  if ((_int)hash_get(erosion, WEST(p))>w)
108  w = (_int) hash_get(erosion, WEST(p));
109  if ((_int)hash_get(erosion, EAST(p))>e)
110  e = (_int) hash_get(erosion, EAST(p));
111  }
112  gen_free_list(preds);
113 
114  // update with vertex erosion
115  const freia_api_t * api = dagvtx_freia_api(v);
116 
117  // for erode/dilate, I look at the kernel, and if it is a
118  // "const" with initial values { 000 XXX XXX } => north=0 and so on.
119  // this is interesting on licensePlate, even if the zero
120  // computations are still performed...
121  if (freia_convolution_p(v)) // convolution special handling...
122  {
123  _int width, height;
124  if (freia_convolution_width_height(v, &width, &height, false))
125  {
126  w+=width/2;
127  e+=width/2;
128  n+=height/2;
129  s+=height/2;
130  }
131  // else simply ignore, should not be used anyway...
132  }
133  else if (api->terapix.north) // erode & dilate
134  {
135  bool north = true, south = true, west = true, east = true;
136  erosion_optimization(v, &north, &south, &west, &east);
137  if (north) n += api->terapix.north;
138  if (south) s += api->terapix.south;
139  if (west) w += api->terapix.west;
140  if (east) e += api->terapix.east;
141  }
142 
143  // store results
144  hash_put(erosion, NORTH(v), (void*) n);
145  hash_put(erosion, SOUTH(v), (void*) s);
146  hash_put(erosion, WEST(v), (void*) w);
147  hash_put(erosion, EAST(v), (void*) e);
148 }
149 
150 /* compute some measures about DAG d.
151  * @return its length (depth)
152  * width, aka maximum live produced image in a level by level
153  * terapix computation cost (per row)
154  * scheduling (could do a better job with a better scheduling?)
155  * maximal erosion in all four directions
156  */
158  (const dag d, hash_table erosion,
159  int * width, int * cost, int * nops,
160  int * north, int * south, int * west, int * east)
161 {
162  set processed = set_make(set_pointer);
163  int dcost = 0, dlength = 0, dwidth = gen_length(dag_inputs(d)), dnops = 0;
164  bool keep_erosion = erosion!=NULL;
165  // vertex output -> NSWE erosion (v+0 to v+3 is N S W E)
166  if (!keep_erosion) erosion = hash_table_make(hash_pointer, 0);
167 
168  FOREACH(dagvtx, in, dag_inputs(d))
169  update_erosions(d, in, erosion);
170 
171  list lv;
172  while ((lv = dag_computable_vertices(d, processed, processed, processed)))
173  {
174  dlength++;
175  int level_width = 0;
176  FOREACH(dagvtx, v, lv)
177  {
178  const freia_api_t * api = dagvtx_freia_api(v);
179  if (freia_convolution_p(v)) // special handling...
180  {
181  _int w, h;
182  if (freia_convolution_width_height(v, &w, &h, false))
183  dcost += 8+(api->terapix.cost*w*h); // hmmm? 3x3 is 35?
184  else
185  dcost += 35; // au pif 3x3
186  }
187  else
188  dcost += api->terapix.cost;
189  // only count non null operations
190  if (api->terapix.cost && api->terapix.cost!=-1) dnops ++;
191  if (api->arg_img_out) level_width++;
192  update_erosions(d, v, erosion);
193  }
194  if (level_width>dwidth) dwidth = level_width;
195 
196  set_append_list(processed, lv);
197  gen_free_list(lv);
198  }
199 
200  // update width
201  int nouts = gen_length(dag_outputs(d));
202  if (nouts>dwidth) dwidth = nouts;
203 
204  // compute overall worth erosion
205  int n=0, s=0, w=0, e=0;
207  {
208  if ((_int)hash_get(erosion, NORTH(out))>n)
209  n = (int) (_int) hash_get(erosion, NORTH(out));
210  if ((_int)hash_get(erosion, SOUTH(out))>s)
211  s = (int) (_int) hash_get(erosion, SOUTH(out));
212  if ((_int)hash_get(erosion, WEST(out))>w)
213  w = (int) (_int) hash_get(erosion, WEST(out));
214  if ((_int)hash_get(erosion, EAST(out))>e)
215  e = (int) (_int) hash_get(erosion, EAST(out));
216  }
217 
218  // cleanup
219  set_free(processed);
220  if (!keep_erosion) hash_table_free(erosion);
221 
222  // return results
223  *north = n, *south = s, *west = w, *east = e,
224  *width = dwidth, *cost = dcost, *nops = dnops;
225  return dlength;
226 }
227 
228 /* @return the list of inputs to vertex v as imagelet numbers.
229  */
230 static list /* of ints */ dag_vertex_pred_imagelets
231  (const dag d, const dagvtx v, const hash_table allocation)
232 {
233  list limagelets = NIL;
235  {
236  dagvtx prod = dagvtx_get_producer(d, v, img, 0);
237  pips_assert("some producer found!", prod!=NULL);
238  limagelets =
239  gen_nconc(limagelets,
240  CONS(int, (int)(_int) hash_get(allocation, prod), NIL));
241  }
242  return limagelets;
243 }
244 
245 /************************************************** GLOBAL MEMORY MANAGEMENT */
246 
247 /* allocate bitfield to described used cells in global memory.
248  */
249 static bool * terapix_gram_init(void)
250 {
251  int row_size = get_int_property(trpx_gram_width);
252  int col_size = get_int_property(trpx_gram_height);
253  bool * gram = (bool *) malloc(sizeof(bool)*row_size*col_size);
254  pips_assert("malloc ok", gram);
255  for (int i=0; i<row_size*col_size; i++)
256  gram[i] = false;
257  return gram;
258 }
259 
260 /* terapix allocate widthxheight in global memory
261  * @return *x *y pointer to available memory
262  */
264  (bool * used, int width, int height, int * x, int * y)
265 {
266  int row_size = get_int_property(trpx_gram_width);
267  int col_size = get_int_property(trpx_gram_height);
268  for (int j = 0; j<col_size-height+1; j++)
269  {
270  for (int i = 0; i<row_size-width+1; i++)
271  {
272  bool ok = true;
273  for (int w = 0; ok && w<width; w++)
274  for (int h = 0; ok && h<height; h++)
275  ok &= !used[(i+w)+(j+h)*row_size];
276  if (ok)
277  {
278  for (int w = 0; w<width; w++)
279  for (int h = 0; h<height; h++)
280  used[(i+w)+(j+h)*row_size] = true;
281  *x = i;
282  *y = j;
283  return;
284  }
285  }
286  }
287  pips_internal_error("cannot find available memory for %dx%d", width, height);
288 }
289 
290 /********************************** TERAPIX CODE GENERATION HELPER FUNCTIONS */
291 
292 /* Return the first/last available imagelet, or create one if necessary
293  * This ensures that the choice is deterministic.
294  * Moreover, as first numbers are IO imagelets, this help putting outputs
295  * in the right imagelet so as to avoid additionnal copies, if possible.
296  */
297 static _int select_imagelet(set availables, int * nimgs, bool first)
298 {
299  ifdebug(8) {
300  pips_debug(8, "selecting first=%s\n", bool_to_string(first));
301  set_fprint(stderr, "availables", availables, (string (*)()) i2a);
302  }
303 
304  _int choice = 0; // zero means no choice yet
305  // allocate if no images are available
306  if (set_empty_p(availables))
307  {
308  pips_assert("can create new images", nimgs!=NULL);
309  (*nimgs)++;
310  choice = *nimgs;
311  }
312  else // search
313  {
314  SET_FOREACH(_int, i, availables)
315  {
316  if (choice==0) choice = i;
317  if (first && (i<choice)) choice = i;
318  if (!first && (i>choice)) choice = i;
319  }
320  set_del_element(availables, availables, (void*) choice);
321  }
322  pips_assert("some choice was made", choice>0);
323  pips_debug(8, "choice is %"_intFMT"\n", choice);
324  return choice;
325 }
326 
// prefix of the symbolic imagelet names emitted in the generated code
#define IMG_PTR "imagelet_"
// prefix which is concatenated with an operation number to name the
// memory used by a reduction
#define RED_PTR "reduction_"
329 
330 /* generate an image symbolic pointer (a name:-).
331  */
332 static void terapix_image(string_buffer sb, int ff, int n)
333 {
334  pips_assert("valid flip-flop", ff==0 || ff==1);
335  pips_assert("valid image number", n!=0);
336  if (n>0)
337  sb_cat(sb, IMG_PTR, i2a(n));
338  else
339  sb_cat(sb, IMG_PTR "io_", i2a(-n), ff? "_1": "_0");
340 }
341 
342 /* set a double buffered image argument.
343  */
344 static void terapix_mcu_img(string_buffer code, int op, string ref, int n)
345 {
346  sb_cat(code, " mcu_macro[0][", i2a(op), "].", ref, " = ");
347  terapix_image(code, 0, n);
348  sb_cat(code, ";\n");
349  sb_cat(code, " mcu_macro[1][", i2a(op), "].", ref, " = ");
350  terapix_image(code, 1, n);
351  sb_cat(code, ";\n");
352 }
353 
354 /* set an integer argument.
355  */
356 static void terapix_mcu_int(string_buffer code, int op, string ref, int val)
357 {
358  sb_cat(code, " mcu_macro[0][", i2a(op), "].", ref);
359  sb_cat(code, " = ", i2a(val), ";\n");
360  sb_cat(code, " mcu_macro[1][", i2a(op), "].", ref);
361  sb_cat(code, " = ", i2a(val), ";\n");
362 }
363 
364 /* set some value string argument.
365  */
366 static void terapix_mcu_val(string_buffer code, int op, string r, string s)
367 {
368  sb_cat(code, " mcu_macro[0][", i2a(op), "].", r, " = ", s, ";\n");
369  sb_cat(code, " mcu_macro[1][", i2a(op), "].", r, " = ", s, ";\n");
370 }
371 
372 /* set some prefixed value string argument.
373  */
374 static void terapix_mcu_pval(string_buffer code, int op, string ref,
375  string p, string s)
376 {
377  sb_cat(code, " mcu_macro[0][", i2a(op), "].", ref,
378  " = ", p, s, ";\n");
379  sb_cat(code, " mcu_macro[1][", i2a(op), "].", ref,
380  " = ", p, s, ";\n");
381 }
382 
383 /* copy some operator *parameters* in the global ram (aka gram).
384  * the coordinates used are (x_<name>, y_<name>).
385  */
386 static void gram_param
388  string name, dagvtx v, hash_table hparams,
389  int width, int height, bool is_kernel, bool * used)
390 {
391  int size = width*height;
392  pips_assert("something to copy...", size>0);
393 
394  int x = 0, y = 0;
395  terapix_gram_allocate(used, width, height, &x, &y);
396 
397  sb_cat(decl, " // operation ", name, " parameters\n");
398  sb_cat(decl, " int16_t p_", name, "[", i2a(size), "];\n");
399  sb_cat(decl, " const int32_t x_", name, " = ", i2a(x), ";\n");
400  sb_cat(decl, " const int32_t y_", name, " = ", i2a(y), ";\n");
401 
402  sb_cat(code, " // copy of operation ", name, " parameters\n");
403  list largs = freia_get_vertex_params(v);
404  pips_assert("some args...", gen_length(largs)>0);
405  string p1 = hash_get(hparams, EXPRESSION(CAR(largs)));
406  // copy code...
407  if (is_kernel)
408  {
409  sb_cat(code,
410  " for(i=0; i<", i2a(size), "; i++)\n"
411  " p_", name, "[i] = ", p1, "[i];\n");
412  }
413  else
414  {
415  switch (size)
416  {
417  case 1: // constant
418  sb_cat(code, " p_", name, "[0] = ", p1, ";\n");
419  break;
420  case 3: // threshold min/max/bin
421  sb_cat(code, " p_", name, "[0] = ", p1, ";\n");
422  sb_cat(code, " p_", name, "[1] = ",
423  hash_get(hparams, EXPRESSION(CAR(CDR(largs)))), ";\n");
424  sb_cat(code, " p_", name, "[2] = ",
425  hash_get(hparams, EXPRESSION(CAR(CDR(CDR(largs))))), ";\n");
426  break;
427  default:
428  pips_internal_error("unexpected gram size");
429  }
430  }
431 
432  sb_cat(code, " gram.xoffset = x_", name, ";\n");
433  sb_cat(code, " gram.yoffset = y_", name, ";\n");
434  sb_cat(code, " gram.width = ", i2a(width), ";\n");
435  sb_cat(code, " gram.height = ", i2a(height), ";\n");
436  sb_cat(code, " gram.params = p_", name, ";\n");
437  sb_cat(code, " freia_mg_write_dynamic_param(&dyn_param);\n");
438 }
439 
440 /* manage GRAM global memory to pass parameters.
441  */
443  (string_buffer code, // generated code
444  string_buffer decl, // generated declarations
445  int op, // operation number
446  const freia_api_t * api,
447  const dagvtx v, // current vertex
448  hash_table hparams, // expression -> parameter...
449  bool * used) // current usage of GRAM
450 {
451  if (!api->arg_misc_in) return;
452 
453  list largs = freia_get_vertex_params(v);
454  string p1 = hash_get(hparams, EXPRESSION(CAR(largs)));
455 
456  // is it a new, never handled, parameter?
457  bool initialize = !hash_defined_p(hparams, p1);
458  // name suffix for variables...
459  if (initialize) hash_put(hparams, p1, strdup(i2a(op)));
460  string name = hash_get(hparams, p1);
461 
462  if (initialize)
463  {
464  switch (api->arg_misc_in)
465  {
466  case 3: // convolution or threshold
467  if (freia_convolution_p(v)) // convolution special case
468  {
469  _int w, h;
470  freia_convolution_width_height(v, &w, &h, true);
471  gram_param(code, decl, name, v, hparams, w, h, true, used);
472  }
473  else // threshold
474  gram_param(code, decl, name, v, hparams, 3, 1, false, used);
475  break;
476  case 1: // kernel or operation with a constant
477  if (api->terapix.north) // let us say it is a kernel...
478  gram_param(code, decl, name, v, hparams, 3, 3, true, used);
479  else
480  gram_param(code, decl, name, v, hparams, 1, 1, false, used);
481  break;
482  default:
483  pips_internal_error("unexpected number of input image arguments");
484  }
485  }
486 
487  // is it always [xy]min3?
488  terapix_mcu_pval(code, op, "xmin3", "x_", name);
489  terapix_mcu_pval(code, op, "ymin3", "y_", name);
490 }
491 
492 /* generate terapix code for
493  * @param code, code stream being generated
494  * @param decl, declaration stream being generated
495  * @param op, operation number
496  * @param api, actual freia operator called
497  * @param used, array to keep track of what gram cells are used
498  * @param hparam, expression to parameter mapping
499  * @param v, dag vertex of the current operation
500  * @param ins, list of image number inputs (i.e. operation arguments)
501  * @param out, image number output for the operation
502  */
505  int op, const freia_api_t * api, bool * used,
506  hash_table hparams, const dagvtx v, const list ins, int out)
507 {
508  // check image in/out consistency
509  pips_assert("#ins ok", gen_length(ins)==api->arg_img_in);
510  pips_assert("out ok", out? api->arg_img_out: !api->arg_img_out);
511 
512  switch (api->arg_img_in)
513  {
514  case 2:
515  pips_assert("2 ins, alu operation...", out);
516  int img1 = INT(CAR(ins)), img2 = INT(CAR(CDR(ins)));
517  terapix_mcu_img(code, op, "xmin1", api->terapix.reverse? img2: img1);
518  terapix_mcu_int(code, op, "ymin1", 0);
519  terapix_mcu_img(code, op, "xmin2", api->terapix.reverse? img1: img2);
520  terapix_mcu_int(code, op, "ymin2", 0);
521  terapix_mcu_img(code, op, "xmin3", out);
522  terapix_mcu_int(code, op, "ymin3", 0);
523  // ??? needed for replace const... although arg 3 is used already
524  // replace_const special argument management is handled directly elsewhere
525  // terapix_gram_management(code, decl, op, api, v, hparams, used);
526  break;
527  case 1:
528  // alu: image op cst 1
529  // threshold 3x1
530  // erode/dilate 3x3
531  // copy
532  terapix_mcu_img(code, op, "xmin1", INT(CAR(ins)));
533  terapix_mcu_int(code, op, "ymin1", 0);
534  if (out) {
535  terapix_mcu_img(code, op, "xmin2", out);
536  terapix_mcu_int(code, op, "ymin2", 0);
537  }
538  terapix_gram_management(code, decl, op, api, v, hparams, used);
539  break;
540  case 0:
541  pips_assert("no input, one output image", out);
542  // const image generation... NSP
543  terapix_mcu_img(code, op, "xmin1", out);
544  terapix_mcu_int(code, op, "ymin1", 0);
545  terapix_gram_management(code, decl, op, api, v, hparams, used);
546  break;
547  default:
548  pips_internal_error("unexpected number of input images");
549  }
550  terapix_mcu_val(code, op, "iter1", "TERAPIX_PE_NUMBER");
551  terapix_mcu_val(code, op, "iter2", "imagelet_size");
552  if (freia_convolution_p(v)) // convolution special case hack
553  {
554  _int w, h;
555  freia_convolution_width_height(v, &w, &h, true);
556  // ??? should I use the parameters?
557  // ??? or check their values?
558  // ??? or remove them from the list as they are inlined?
559  terapix_mcu_int(code, op, "iter3", (int) w);
560  terapix_mcu_int(code, op, "iter4", (int) h);
561  }
562  else
563  {
564  terapix_mcu_val(code, op, "iter3", "0");
565  terapix_mcu_val(code, op, "iter4", "0");
566  }
567  terapix_mcu_val(code, op, "addrStart", api->terapix.ucode);
568 }
569 
570 /* @brief initialize a few rows at mem address with value val
571  */
572 static void terapix_init_row(
573  string_buffer decl,
575  string base,
576  string suff,
577  string mem,
578  int nrow,
579  string val,
580  bool * used)
581 {
582  // get one memory cell for the value
583  int x = 0, y = 0;
584  terapix_gram_allocate(used, 1, 1, &x, &y);
585 
586  // operation name
587  string name = strdup(cat(base, "_", suff));
588 
589  // set the constant
590  sb_cat(decl, " // operation ", name, " initialization\n"
591  " int16_t p_", name, "[1];\n");
592  sb_cat(decl, " const int32_t x_", name, " = ", i2a(x), ";\n");
593  sb_cat(decl, " const int32_t y_", name, " = ", i2a(y), ";\n");
594 
595  sb_cat(code, " // initializing ", name, "\n"
596  " p_", name, "[0] = ", val, ";\n"
597  " gram.xoffset = x_", name, ";\n"
598  " gram.yoffset = y_", name, ";\n"
599  " gram.width = 1;\n"
600  " gram.height = 1;\n"
601  " gram.params = p_", name, ";\n"
602  " freia_mg_write_dynamic_param(&dyn_param);\n");
603 
604  // call the initialization
605  sb_cat(code,
606  " // initialize memory for operation ", name, "\n"
607  " mem_init.xmin1 = ", mem, ";\n"
608  " mem_init.ymin1 = 0;\n"
609  " mem_init.xmin2 = 0;\n"
610  " mem_init.ymin2 = 0;\n"
611  " mem_init.xmin3 = 0;\n"
612  " mem_init.ymin3 = 0;\n"
613  " mem_init.iter1 = TERAPIX_PE_NUMBER;\n"
614  " mem_init.iter2 = ", i2a(nrow),";\n"
615  " mem_init.iter3 = 0;\n"
616  " mem_init.iter4 = 0;\n"
617  " mem_init.addrStart = TERAPIX_UCODE_SET_CONST;\n"
618  " param.size = sizeof(terapix_mcu_macrocode); // not used?\n"
619  " param.raw = (void*) (&mem_init);\n"
620  " ret |= freia_mg_work(&param);\n"
621  " ret |= freia_mg_end_work();\n");
622 
623  // cleanup
624  free(name);
625 }
626 
627 /* @brief initialize the memory at addr depending on the operation to perform
628  * @param decl, added declarations are put there
629  * @param body, generated code is put there
630  * @param nop, current operation number
631  * @param mem, memory symbolic x address
632  * @param api, freia operation
633  * @param used, current use of Global RAM (gram)
634  */
636  string_buffer decl,
637  string_buffer body,
638  int nop,
639  string mem,
640  const freia_api_t * api,
641  bool * used)
642 {
643  string op = api->compact_name;
644  pips_assert("operation is a measure",
645  same_string_p(op, "min") || same_string_p(op, "min!") ||
646  same_string_p(op, "max") || same_string_p(op, "max!") ||
647  same_string_p(op, "vol"));
648  string sop = strdup(i2a(nop));
649 
650  // INT16 should be a property?
651 
652  if (same_string_p(op, "min") || same_string_p(op, "min!"))
653  terapix_init_row(decl, body, sop, "val", mem, 1, "INT16_MAX", used);
654  if (same_string_p(op, "max") || same_string_p(op, "max!"))
655  terapix_init_row(decl, body, sop, "val", mem, 1, "INT16_MIN", used);
656  if (same_string_p(op, "min!") || same_string_p(op, "max!"))
657  {
658  string memp1 = strdup(cat(mem,"+1"));
659  terapix_init_row(decl, body, sop, "loc", memp1, 4, "0", used);
660  free(memp1);
661  }
662  if (same_string_p(op, "vol"))
663  terapix_init_row(decl, body, sop, "val", mem, 2, "0", used);
664 
665  free(sop);
666 }
667 
668 /* @brief generate reduction extraction code
669  */
671  string_buffer decl,
672  string_buffer tail,
673  int n_op,
674  string mem,
675  const freia_api_t * api)
676 {
677  pips_assert("some results are expected", api->arg_misc_out>0);
678  string sop = strdup(i2a(n_op));
679  // I do not understand the underlying logic of these values
680  string width = api->arg_misc_out==3? "5": "1";
681  sb_cat(decl,
682  " // array for reduction ", sop, " extraction\n"
683  " int32_t red_", sop, "[", i2a(api->arg_misc_out), "];\n");
684  sb_cat(tail,
685  " redter.xres = ", mem, ";\n"
686  " redter.yres = 0;\n"
687  " redter.width = ", width, ";\n"
688  " redter.height = TERAPIX_PE_NUMBER;\n"
689  " redter.result = (void*) red_", sop, ";\n"
690  " redter.macroid = ", api->terapix.ucode, ";\n"
691  // just gessing that there must be a first input image
692  // ??? we assume that all image are of the same size?!
693  " redter.imgwidth = i0->width;\n"
694  " redter.imgheight = i0->height;\n"
695  " redter.subimgwidth = TERAPIX_PE_NUMBER;\n"
696  " redter.subimgheight = imagelet_size;\n"
697  "\n"
698  " ret |= freia_cg_read_reduction_results(&redres);\n"
699  "\n");
700  free(sop);
701 }
702 
703 /*************************************************** TERAPIX CODE GENERATION */
704 
705 /* generate a terapix call for dag thedag.
706  * the memory allocation is managed here.
707  * however this function is dumb, the scheduling is just inherited as is...
708  * @return number of output images...
709  */
711  (const string module,
712  const string fname_dag,
714  dag thedag,
715  list /* of expression */ *params)
716 {
717  // total number of imagelets used for computing the dag
718  // will be updated later, implicitely derived from the scheduling
719  int n_imagelets = 0;
720  // number of input images
721  int n_ins = gen_length(dag_inputs(thedag));
722  // number of output images
723  int n_outs = gen_length(dag_outputs(thedag));
724  // number of needed double buffers for I/Os.
725  // this is also the number of I/O images
726  int n_double_buffers;
727 
728  if (trpx_overlap_io_p())
729  n_double_buffers = n_ins+n_outs;
730  else
731  n_double_buffers = (n_ins>n_outs)? n_ins: n_outs; // max(#ins, #outs)
732 
733  pips_assert("some I/O images", n_double_buffers>0);
734 
735  // the memory will be decremented for "measures" data (reductions),
736  // and then divided among imagelets
737  int available_memory = get_int_property(trpx_mem_prop);
738 
740  head = string_buffer_make(true),
741  decl = string_buffer_make(true),
742  init = string_buffer_make(true),
743  body = string_buffer_make(true),
744  dbio = string_buffer_make(true),
745  tail = string_buffer_make(true);
746 
747  // array variable name in caller -> local kernel parameter
748  // used to detect if a kernel is already available, so as to skip
749  // its copy and share the generated parameter.
751 
752  // number of arguments to generated function
753  int nargs = 0;
754 
755  // get stats
756  int length, width, cost, nops, n, s, w, e;
757  length = dag_terapix_measures(thedag, NULL,
758  &width, &cost, &nops, &n, &s, &w, &e);
759 
760  int comm = get_int_property(trpx_dmabw_prop);
761 
762 // integer property to string
763 #define ip2s(n) i2a(get_int_property(n))
764 
765  // show stats in function's comments
766  sb_cat(head, "\n"
767  "/* FREIA terapix helper function for module ", module, "\n");
768  sb_cat(head, " *\n");
769  // show terapix code generation parameters
770  sb_cat(head, " * RAMPE = ", ip2s(trpx_mem_prop), "\n");
771  sb_cat(head, " * NPE = ", ip2s(trpx_npe_prop), "\n");
772  sb_cat(head, " * DMA BW = ", ip2s(trpx_dmabw_prop), "\n");
773  sb_cat(head, " * GRAM W = ", ip2s(trpx_gram_width), "\n");
774  sb_cat(head, " * GRAM H = ", ip2s(trpx_gram_height), "\n");
775  sb_cat(head, " * DAG CUT = ", get_string_property(trpx_dag_cut), "\n");
776  sb_cat(head, " * OVERLAP = ", bool_to_string(trpx_overlap_io_p()), "\n");
777  sb_cat(head, " * IMAGE H = ", ip2s("FREIA_IMAGE_HEIGHT"), "\n");
778  sb_cat(head, " * MAX SIZE = ", ip2s(trpx_max_size), "\n");
779  sb_cat(head, " *\n");
780  // show dag statistics
781  sb_cat(head, " * ", i2a(n_ins), " input image", n_ins>1? "s": "");
782  sb_cat(head, ", ", i2a(n_outs), " output image", n_outs>1? "s": "", "\n");
783  sb_cat(head, " * ", i2a(nops), " image operations in dag\n");
784  sb_cat(head, " * dag length is ", i2a(length));
785  sb_cat(head, ", dag width is ", i2a(width), "\n");
786  sb_cat(head, " * costs in cycles per imagelet row:\n");
787  sb_cat(head, " * - computation: ", i2a(cost), "\n");
788  // number of transfers depends on overlapping
789  int n_trs = trpx_overlap_io_p()? (n_ins>n_outs? n_ins: n_outs): n_ins+n_outs;
790  sb_cat(head, " * - communication: ", i2a(comm*n_trs), "\n");
791  sb_cat(head, " */\n");
792 
793  // generate function declaration
794  sb_cat(head, "freia_status ", fname_dag, "(");
795  for (int i = 0; i<n_outs; i++)
796  sb_cat(head, nargs++? ",": "", "\n " FREIA_IMAGE "o", i2a(i));
797  for (int i = 0; i<n_ins; i++)
798  sb_cat(head, nargs++? ",": "", "\n const " FREIA_IMAGE "i", i2a(i));
799  // other arguments to come...
800 
801  // corresponding helper call arguments
802  list limg = NIL;
803  FOREACH(dagvtx, voa, dag_outputs(thedag))
804  limg = CONS(entity, vtxcontent_out(dagvtx_content(voa)), limg);
805  FOREACH(dagvtx, via, dag_inputs(thedag))
806  limg = CONS(entity, vtxcontent_out(dagvtx_content(via)), limg);
807  limg = gen_nreverse(limg);
808 
809  sb_cat(decl,
810  "{\n"
811  " // declarations:\n"
812  " freia_microcode mcode;\n"
813  " freia_op_param param;\n"
814  " freia_dynamic_param dyn_param;\n"
815  " terapix_gram gram;\n"
816  " int i;\n" // not always used...
817  " freia_status ret = FREIA_OK;\n"
818  " // data structures for reductions\n"
819  " terapix_mcu_macrocode mem_init;\n"
820  " freia_reduction_results redres;\n"
821  " terapix_reduction redter;\n"
822  " // overall structure which describes the computation\n"
823  " terapix_mcu_instr mcu_instr;\n");
824 
825  sb_cat(body,
826  "\n"
827  " // body:\n"
828  " // mcode param\n"
829  " mcode.raw = (void*) terapix_ucode_array;\n"
830  " mcode.size = TERAPIX_UCODE_SIZE_T;\n"
831  " freia_mg_write_microcode(&mcode);\n"
832  "\n"
833  " // dyn_param contents\n"
834  " dyn_param.raw = &gram;\n"
835  " dyn_param.size = sizeof(terapix_gram);\n"
836  "\n"
837  " // redres contents\n"
838  " redres.raw = (void*) &redter;\n"
839  " redres.size = sizeof(terapix_reduction);\n"
840  "\n");
841 
842  // string_buffer head, decls, end, settings;
843 
844  // schedule to imagelet numbers as needed...
845  // use a named pointer the value of which will be known later,
846  // depending on the number of needed imagelets
847  // operation -> imagelet number
848  // the imagelet number is inverted if it is an I/O
850  set computed = set_make(set_pointer);
851 
852  // the GRAM initialization may be shared between helper calls?
853  bool * used = terapix_gram_init();
854 
855  // currently available imagelets
856  set avail_img = set_make(set_pointer);
857 
858  // output images are the first ones when I/O comms overlap
859  if (trpx_overlap_io_p())
860  while (n_imagelets<n_outs)
861  set_add_element(avail_img, avail_img, (void*) (_int) ++n_imagelets);
862 
863  if (n_ins)
864  {
865  // ??? they should be given in the order of the arguments
866  // when calling the runtime function.
867  int n = 0;
868  sb_cat(dbio, "\n // inputs:\n");
869  FOREACH(dagvtx, in, dag_inputs(thedag))
870  {
871  // update primary imagelet number
872  n_imagelets++;
873  set_add_element(computed, computed, in);
874  // ??? stupid bug which filters undefined values, i.e. -16
875  // I should really use a container...
876  hash_put(allocation, in, (void*) (_int) -n_imagelets);
877 
878  string sn = strdup(i2a(n)), si = strdup(i2a(n_imagelets));
879 
880  // ??? tell that n_imagelets is an input
881  sb_cat(dbio, " // - imagelet ", si, " is i", sn, " for ",
883  "\n");
884 
885  sb_cat(dbio, " tile_in[0][", sn, "].x = " IMG_PTR "io_", si, "_0;\n");
886  sb_cat(dbio, " tile_in[0][", sn, "].y = 0;\n");
887  sb_cat(dbio, " tile_in[1][", sn, "].x = " IMG_PTR "io_", si, "_1;\n");
888  sb_cat(dbio, " tile_in[1][", sn, "].y = 0;\n");
889  free(sn);
890  free(si);
891  n++;
892  }
893  sb_cat(dbio, "\n");
894  }
895  else
896  {
897  sb_cat(dbio, "\n // no input\n\n");
898  }
899 
900  // complete if need be, there will be AT LEAST this number of images
901  while (n_imagelets<n_double_buffers)
902  set_add_element(avail_img, avail_img, (void*) (_int) ++n_imagelets);
903 
904  set deads = set_make(set_pointer);
905  // newly created parameters at this round
906 
907  // generate code for every computation vertex
908  int n_ops = 0;
909  list vertices = gen_nreverse(gen_copy_seq(dag_vertices(thedag)));
910  FOREACH(dagvtx, current, vertices)
911  {
912  // skip this vertex
913  if (set_belong_p(computed, current))
914  continue;
916  continue;
917 
918  // compute freed images...
919  set_clear(deads);
920  compute_dead_vertices(deads, computed, thedag, current);
921 
923  pips_assert("there is a statement",
927  // int optype = dagvtx_optype(current);
928  int opid = dagvtx_opid(current);
929  const freia_api_t * api = get_freia_api(opid);
930  pips_assert("freia api found", api!=NULL);
931 
932  // if inplace, append freed images to availables
933  if (api->terapix.inplace)
934  {
935  SET_FOREACH(dagvtx, v, deads)
936  {
937  // but keep intermediate output images!
938  if (!gen_in_list_p(v, dag_outputs(thedag)))
939  {
940  _int img = (_int) hash_get(allocation, v);
941  if (img<0) img=-img;
942  set_add_element(avail_img, avail_img, (void*) img);
943  }
944  }
945  }
946 
947  // generate inS -> out computation
948  // - code
949  // imagelet inputs
951  sb_cat(body, " // ", i2a(n_ops), ": ", api->compact_name, "(");
952  if (ins)
953  {
954  // show input imagelet numbers
955  int in_count=0;
956  FOREACH(int, i, ins)
957  sb_cat(body, in_count++? ",": "", i2a(i>0? i: -i));
958  }
959  sb_cat(body, ")");
960 
961  // imagelet output
962  _int choice = 0;
963  if (api->arg_img_out==1)
964  {
965  bool is_output = gen_in_list_p(current, dag_outputs(thedag));
966  // SELECT one available imagelet
967  // if none is available, a new one is implicitely created
968  choice = select_imagelet(avail_img, &n_imagelets, is_output);
969  sb_cat(body, " -> ", i2a((int) choice));
970  // there is a subtlety here, if no I/O image was available
971  // then a copy will have to be inserted later on, see "PANIC".
972  if (choice<=n_double_buffers) choice = -choice;
973  hash_put(allocation, current, (void*) choice);
974  }
975  sb_cat(body, "\n");
976 
977  // update helper call arguments...
978  *params = gen_nconc(*params,
980  head, NULL, hparams, &nargs));
981 
982  // special case for replace_const, which needs a 4th argument
983  if (same_string_p(api->compact_name, ":"))
984  {
985  sb_cat(body, " // *special* set parameter for replace_const\n");
986  terapix_mcu_int(body, n_ops, "xmin1", 0);
987  terapix_mcu_int(body, n_ops, "ymin1", 0);
988  terapix_mcu_int(body, n_ops, "xmin2", 0);
989  terapix_mcu_int(body, n_ops, "ymin2", 0);
990  terapix_gram_management(body, decl, n_ops, api, current, hparams, used);
991  terapix_mcu_val(body, n_ops, "iter1", "TERAPIX_PE_NUMBER");
992  terapix_mcu_int(body, n_ops, "iter2", 0);
993  terapix_mcu_int(body, n_ops, "iter3", 0);
994  terapix_mcu_int(body, n_ops, "iter4", 0);
995  terapix_mcu_val(body, n_ops, "addrStart",
996  "TERAPIX_UCODE_SET_CONST_RAMREG");
997 
998  sb_cat(body, " // now take care of actual operation\n");
999  n_ops++;
1000  }
1001 
1002  if (api->terapix.memory)
1003  {
1004  string sop = strdup(i2a(n_ops));
1005  // reserve the necessary memory at the end of the segment
1006  available_memory -= api->terapix.memory;
1007  string mem = strdup(cat(RED_PTR, sop));
1008  sb_cat(init, " int ", mem, " = ", i2a(available_memory), ";\n");
1009 
1010  // initialize the memory based on the measure operation
1011  terapix_initialize_memory(decl, body, n_ops, mem, api, used);
1012 
1013  // imagelet computation
1014  sb_cat(body, " // set measure ", api->compact_name, " at ", mem, "\n");
1015  terapix_mcu_val(body, n_ops, "xmin2", mem);
1016  terapix_mcu_val(body, n_ops, "ymin2", "0");
1017 
1018  // should not be used, but just in case...
1019  terapix_mcu_val(body, n_ops, "xmin3", "0");
1020  terapix_mcu_val(body, n_ops, "ymin3", "0");
1021 
1022  // extraction
1023  sb_cat(tail, " // get measure ", api->compact_name,
1024  " result from ", mem, "\n");
1025  terapix_get_reduction(decl, tail, n_ops, mem, api);
1026 
1027  sb_cat(tail, " // assign reduction parameter",
1028  api->arg_misc_out>1? "s":"", "\n");
1029  int i = 0;
1031  {
1032  string var = (string) hash_get(hparams, arg);
1033  // hmmm, kind of a hack to get the possibly needed cast
1034  string cast = strdup(api->arg_out_types[i]);
1035  string space = strchr(cast, ' ');
1036  if (space) *space = '\0';
1037  sb_cat(tail, " *", var, " = (", cast, ") "
1038  "red_", sop, "[", i2a(i), "];\n");
1039  i++;
1040  free(cast);
1041  }
1042  free(mem);
1043  free(sop);
1044  }
1045 
1046  if (api==hwac_freia_api(AIPO "copy") && choice==INT(CAR(ins)))
1047  {
1048  // skip in place copy, which may happen if the selected target
1049  // image buffer happens to be the same as the input.
1050  sb_cat(body, " // in place copy skipped\n");
1051  n_ops--;
1052  }
1053  else
1054  {
1055  terapix_macro_code(body, decl, n_ops, api, used,
1056  hparams, current, ins, choice);
1057  }
1058 
1059  gen_free_list(ins), ins=NIL;
1060 
1061  // if NOT inplace, append freed images to availables now
1062  if (!api->terapix.inplace)
1063  {
1064  SET_FOREACH(dagvtx, v, deads)
1065  {
1066  // but keep intermediate output images!
1067  if (!gen_in_list_p(v, dag_outputs(thedag)))
1068  {
1069  _int img = (_int) hash_get(allocation, v);
1070  if (img<0) img=-img;
1071  set_add_element(avail_img, avail_img, (void*) img);
1072  }
1073  }
1074  }
1075 
1076  set_add_element(computed, computed, current);
1077  n_ops++;
1078  }
1079 
1080  // handle function image arguments
1082 
1083  if (n_outs)
1084  {
1085  int n = 0;
1086  sb_cat(dbio, " // outputs:\n");
1087  FOREACH(dagvtx, out, dag_outputs(thedag))
1088  {
1089  int oimg = (int) (_int) hash_get(allocation, out);
1090  if (oimg<0) oimg=-oimg;
1091  // when not overlapping, any I/O image is fine
1092  // when overlapping, must be one of the first
1093  // because the later ones are used in parallel as inputs
1094  if ((!trpx_overlap_io_p() && oimg>n_double_buffers) ||
1095  (trpx_overlap_io_p() && oimg>n_outs))
1096  {
1097  // PANIC:
1098  // if there is no available "IO" imagelet when an output is
1099  // produced, it will have to be put there with a copy later on.
1100  int old = oimg;
1101  oimg = select_imagelet(avail_img, NULL, true);
1102  pips_assert("IO imagelet found for output", oimg<=n_double_buffers);
1103 
1104  // generate copy code old -> oimg
1105  // hmmm... could not generate a test case where this is triggered...
1106  // the additional cost which should be reported?
1107  sb_cat(body, " // output copy ", i2a(old));
1108  sb_cat(body, " -> ", i2a(oimg), "\n");
1109  list lic = CONS(int, old, NIL);
1110  // -oimg to tell the code generator that we are dealing with
1111  // a double buffered image...
1112  terapix_macro_code(body, decl, n_ops, hwac_freia_api(AIPO "copy"),
1113  NULL, NULL, NULL, lic, -oimg);
1114  gen_free_list(lic);
1115  n_ops++;
1116  }
1117  // tell that oimg is an output
1118  // ??? tell that n_imagelets is an input
1119  string sn = strdup(i2a(n)), so = strdup(i2a(oimg));
1120  sb_cat(dbio, " // - imagelet ", so);
1121  sb_cat(dbio, " is o", sn, " for ");
1122  sb_cat(dbio,
1124  "\n");
1125  sb_cat(dbio, " tile_out[0][", sn, "].x = " IMG_PTR"io_", so, "_0;\n");
1126  sb_cat(dbio, " tile_out[0][", sn, "].y = 0;\n");
1127  sb_cat(dbio, " tile_out[1][", sn, "].x = " IMG_PTR"io_", so, "_1;\n");
1128  sb_cat(dbio, " tile_out[1][", sn, "].y = 0;\n");
1129  free(sn);
1130  free(so);
1131  n++;
1132  }
1133  sb_cat(dbio, "\n");
1134  sb_cat(body, "\n");
1135  }
1136  else
1137  {
1138  sb_cat(dbio, " // no output\n\n");
1139  }
1140 
1141  // now I know how many imagelets are needed
1142  int total_imagelets = n_imagelets + n_double_buffers;
1143  int imagelet_rows = available_memory/total_imagelets; // round down
1144  int imagelet_max_rows = imagelet_rows;
1145 
1146  // declarations when we know the number of operations
1147  // [2] for flip/flop double buffer handling
1148  sb_cat(decl, " // flip flop macro code and I/Os\n");
1149  sb_cat(decl, " terapix_mcu_macrocode mcu_macro[2][", i2a(n_ops), "];\n");
1150  if (n_ins)
1151  sb_cat(decl, " terapix_tile_info tile_in[2][", i2a(n_ins), "];\n");
1152  if (n_outs)
1153  sb_cat(decl, " terapix_tile_info tile_out[2][", i2a(n_outs), "];\n");
1154 
1155  // computed values
1156  sb_cat(decl, " // imagelets definitions:\n");
1157  sb_cat(decl, " // - ", i2a(n_imagelets), " computation imagelets\n");
1158  sb_cat(decl, " // - ", i2a(n_double_buffers), " double buffer imagelets\n");
1159 
1160  // we may optimize the row size for a target image height, if available
1161  int image_height = FREIA_DEFAULT_HEIGHT;
1162  int vertical_border = n>s? n: s;
1163  int max_computed_size = imagelet_rows-2*vertical_border;
1164  // this is really a MAXIMUM available size that can be set from outside
1165  int max_size = get_int_property(trpx_max_size);
1166 
1167  if (image_height==0)
1168  {
1169  // what about vol(cst())?
1170  pips_assert("at least one image is needed!", n_ins||n_outs);
1171  // dynamic adjustment of the imagelet size
1172  sb_cat(decl,
1173  " // dynamic optimal imagelet size computation\n"
1174  " // this formula must match what the scheduler does!\n"
1175  " int vertical_border = ", i2a(vertical_border), ";\n"
1176  // use first input image for the reference size, or default to output
1177  " int image_height = ", n_ins? "i": "o", "0->heightWa;\n");
1178  sb_cat(decl,
1179  " int max_computed_size = ", i2a(max_computed_size), ";\n"
1180  " int n_tiles = (image_height+max_computed_size-1)/max_computed_size;\n"
1181  " int imagelet_size = (n_tiles==1)? image_height:\n"
1182  " ((image_height+n_tiles-1)/n_tiles)+2*vertical_border;\n");
1183  if (max_size)
1184  {
1185  sb_cat(decl,
1186  " // max imagelet size requested..."
1187  " int max_size = ", i2a(max_size), ";\n"
1188  " if (imagelet_size>max_size)\n"
1189  " imagelet_size = max_size;\n");
1190  }
1191  }
1192  else // assume the provided image_height
1193  {
1194  // we adjust statically the imagelet size so that we avoid recomputing
1195  // pixels... the formula must match whatever the scheduler does!
1196  // ??? hmmm... only for inner tiles
1197  // #tiles is ceil(height/computed)
1198  int n_tiles = (image_height+max_computed_size-1)/max_computed_size;
1199  // now we compute back the row size
1200  int optim_rows = ((image_height+n_tiles-1)/n_tiles)+2*vertical_border;
1201  // fix if the tile is too large
1202  if (optim_rows>image_height) optim_rows = image_height;
1203  imagelet_rows = optim_rows;
1204 
1205  pips_assert("optimized row size lower than max row size",
1206  optim_rows<=imagelet_rows && optim_rows>0);
1207 
1208  // now we set the value directly
1209  sb_cat(decl, " // imagelet max size: ", i2a(imagelet_max_rows), "\n");
1210 
1211  // the runtime can use imagelet_rows or less
1212  sb_cat(decl, " int imagelet_size = ",
1213  i2a(max_size?
1214  // max_size is defined, may use it if smaller than computed size
1215  (max_size<imagelet_rows? max_size: imagelet_rows):
1216  // max_size is not defined
1217  imagelet_rows), ";\n");
1218  }
1219 
1220  // generate imagelet pointers
1221  for (int i=1; i<=total_imagelets; i++)
1222  {
1223  sb_cat(decl, " int " IMG_PTR, i2a(i), " = ");
1224  sb_cat(decl, i2a(imagelet_max_rows * (i-1)), ";\n");
1225  }
1226  // append reduction memory pointers
1227  sb_cat(decl, "\n");
1228 
1229  if (string_buffer_size(init)>0)
1230  {
1231  sb_cat(decl, " // memory for reductions\n");
1233  sb_cat(decl, "\n");
1234  }
1236 
1237  // generate imagelet double buffer pointers
1238  // sb_cat(dbio, " // double buffer management:\n");
1239  sb_cat(decl, " // double buffer assignment\n");
1240  for (int i=1; i<=n_double_buffers; i++)
1241  {
1242  // sb_cat(dbio, " // - buffer ", i2a(i), "/");
1243  // sb_cat(dbio, i2a(i+n_imagelets), "\n");
1244 
1245  sb_cat(decl, " int " IMG_PTR "io_", i2a(i), "_0 = ");
1246  sb_cat(decl, IMG_PTR, i2a(i), ";\n");
1247  sb_cat(decl, " int " IMG_PTR "io_", i2a(i), "_1 = ");
1248  sb_cat(decl, IMG_PTR, i2a(i+n_imagelets), ";\n");
1249  }
1250 
1251  // incorporate IO stuff
1252  string_buffer_append_sb(body, dbio);
1253  string_buffer_free(&dbio);
1254 
1255  // tell about imagelet erosion...
1256  // current output should be max(w,e) & max(n,s)
1257  sb_cat(body, " // imagelet erosion for the computation\n");
1258  // terapix runtime issue if n_tiles==1...
1259  sb_cat(body, " mcu_instr.borderTop = ", i2a(n), ";\n");
1260  sb_cat(body, " mcu_instr.borderBottom = ", i2a(s), ";\n");
1261  sb_cat(body, " mcu_instr.borderLeft = ", i2a(w), ";\n");
1262  sb_cat(body, " mcu_instr.borderRight = ", i2a(e), ";\n");
1263  sb_cat(body, " mcu_instr.imagelet_height = imagelet_size;\n"
1264  " mcu_instr.imagelet_width = TERAPIX_PE_NUMBER;\n"
1265  "\n");
1266 
1267  sb_cat(body, " // outputs\n"
1268  " mcu_instr.nbout = ", i2a(n_outs), ";\n");
1269  if (n_outs)
1270  sb_cat(body,
1271  " mcu_instr.out0 = tile_out[0];\n"
1272  " mcu_instr.out1 = tile_out[1];\n");
1273  else
1274  sb_cat(body,
1275  " mcu_instr.out0 = NULL;\n"
1276  " mcu_instr.out1 = NULL;\n");
1277 
1278  sb_cat(body, "\n"
1279  " // inputs\n"
1280  " mcu_instr.nbin = ", i2a(n_ins), ";\n");
1281  if (n_ins)
1282  sb_cat(body,
1283  " mcu_instr.in0 = tile_in[0];\n"
1284  " mcu_instr.in1 = tile_in[1];\n");
1285  else
1286  sb_cat(body,
1287  " mcu_instr.in0 = NULL;\n"
1288  " mcu_instr.in1 = NULL;\n");
1289 
1290  sb_cat(body,
1291  "\n"
1292  " // actual instructions\n"
1293  " mcu_instr.nbinstr = ", i2a(n_ops), ";\n"
1294  " mcu_instr.instr0 = mcu_macro[0];\n"
1295  " mcu_instr.instr1 = mcu_macro[1];\n");
1296 
1297  // tell about imagelet size
1298  // NOTE: the runtime *MUST* take care of possible in/out aliasing
1299  sb_cat(body,
1300  "\n"
1301  " // call terapix runtime\n"
1302  " param.size = -1; // not used\n"
1303  " param.raw = (void*) &mcu_instr;\n"
1304  " ret |= freia_cg_template_process(&param");
1305  for (int i=0; i<n_outs; i++)
1306  sb_cat(body, ", o", i2a(i));
1307  for (int i=0; i<n_ins; i++)
1308  sb_cat(body, ", i", i2a(i));
1309  sb_cat(body, ");\n");
1310 
1311  // ??? I must compute the total erosion
1312  // ??? I should check that something IS computed...
1313 
1315  sb_cat(code, ")\n");
1318  sb_cat(code, "\n");
1319  sb_cat(code, " // extract measures\n");
1321  sb_cat(code, "\n return ret;\n}\n\n");
1322 
1323  // cleanup computed vertices: they are REMOVED from the dag and "killed"
1324  // ??? should rather return them and the caller should to the cleaning?
1325  FOREACH(dagvtx, vr, vertices)
1326  {
1327  dag_remove_vertex(thedag, vr);
1328  if (set_belong_p(computed, vr))
1329  {
1333  free_dagvtx(vr);
1334  }
1335  }
1336  // cleanup
1337  gen_free_list(vertices), vertices = NIL;
1338  string_buffer_free(&head);
1339  string_buffer_free(&decl);
1340  string_buffer_free(&body);
1341  string_buffer_free(&tail);
1343  // ??? free strings!
1344  hash_table_free(hparams);
1345  set_free(avail_img);
1346  set_free(computed);
1347  set_free(deads);
1348  free(used);
1349 
1350  return n_outs;
1351 }
1352 
1353 /******************************************************************* ONE DAG */
1354 
1355 /* generate terapix code for this one dag, which should be already split.
1356  * return the statement number of the helper insertion
1357  */
1359  string module,
1360  list /* of statements */ ls,
1361  dag d,
1362  string fname_fulldag,
1363  int n_split,
1364  int n_cut,
1365  set global_remainings,
1366  FILE * helper_file,
1367  set helpers,
1368  int stnb,
1369  hash_table signatures)
1370 {
1371  ifdebug(4) {
1373  dag_dump(stderr, "one_dag", d);
1374  }
1375 
1376  set remainings = set_make(set_pointer);
1378 
1379  // name_<number>_<split>[_<cut>]
1380  string fname_dag = strdup(cat(fname_fulldag, "_", i2a(n_split)));
1381  if (n_cut!=-1)
1382  {
1383  string s = strdup(cat(fname_dag, "_", i2a(n_cut)));
1384  free(fname_dag);
1385  fname_dag = s;
1386  }
1387 
1388  dag_dot_dump(module, fname_dag, d, NIL, NIL);
1389 
1390  // - output function in helper file
1391  list lparams = NIL;
1392 
1394  _int nout = freia_terapix_call(module, fname_dag, code, d, &lparams);
1395  string_buffer_to_file(code, helper_file);
1397 
1398  // - and substitute its call...
1399  stnb = freia_substitute_by_helper_call(d, global_remainings, remainings,
1400  ls, fname_dag, lparams, helpers, stnb);
1401 
1402  // record (simple) signature
1403  hash_put(signatures, local_name_to_top_level_entity(fname_dag), (void*) nout);
1404 
1405  // cleanup
1406  free(fname_dag), fname_dag = NULL;
1407 
1408  return stnb;
1409 }
1410 
1411 /************************************************** TERAPIX DAG SCALAR SPLIT */
1412 
1413 /* fill in erosion hash table from dag d.
1414  */
1416 {
1417  int i = 0;
1418  dag_terapix_measures(d, erosion, &i, &i, &i, &i, &i, &i, &i);
1419 }
1420 
1421 /* global variable used by the dagvtx_terapix_priority function,
1422  * because qsort does not allow passing a context descriptor to the
      * comparison function.
1423  */
1424 static hash_table erosion = NULL;
1425 
1426 static void dag_terapix_reset_erosion(const dag d)
1427 {
1428  pips_assert("erosion is allocated", erosion!=NULL);
1431 }
1432 
1433 /* comparison function for sorting dagvtx in qsort,
1434  * this is deep voodoo, because the priority has an impact on
1435  * correctness? that should not be the case as only computations
1436  * allowed by dependencies are scheduled.
1437  * tells v1 < (before) v2 => -1
1438  */
1439 static int dagvtx_terapix_priority(const dagvtx * v1, const dagvtx * v2)
1440 {
1441  pips_assert("global erosion is set", erosion!=NULL);
1442 
1443  // ??? should prioritize if more outputs?
1444  // ??? should prioritize inplace?
1445  // ??? should prioritize no erosion first? levels do that currrently?
1446  string why = "none";
1447  int result = 0;
1448  vtxcontent
1449  c1 = dagvtx_content(*v1),
1450  c2 = dagvtx_content(*v2);
1451  const freia_api_t
1452  * a1 = dagvtx_freia_api(*v1),
1453  * a2 = dagvtx_freia_api(*v2);
1454 
1455  // prioritize first scalar ops, measures and last copies
1456  // if there is only one of them
1457  if (vtxcontent_optype(c1)!=vtxcontent_optype(c2))
1458  {
1459  // non implemented stuff
1461  result = 1, why = "impl";
1462  else if (!freia_aipo_terapix_implemented(a2))
1463  result = -1, why = "impl";
1464  // scalars operations first to remove (scalar) dependences
1465  else if (vtxcontent_optype(c1)==spoc_type_oth)
1466  result = -1, why = "scal";
1467  else if (vtxcontent_optype(c2)==spoc_type_oth)
1468  result = 1, why = "scal";
1469  // then measurements are put first
1470  else if (vtxcontent_optype(c1)==spoc_type_mes)
1471  result = -1, why = "mes";
1472  else if (vtxcontent_optype(c2)==spoc_type_mes)
1473  result = 1, why = "mes";
1474  // the copies are performed last...
1475  else if (vtxcontent_optype(c1)==spoc_type_nop)
1476  result = 1, why = "copy";
1477  else if (vtxcontent_optype(c2)==spoc_type_nop)
1478  result = -1, why = "copy";
1479  // idem with image generation...
1480  else if (vtxcontent_optype(c1)==spoc_type_alu &&
1481  vtxcontent_inputs(c1)==NIL)
1482  result = 1, why = "gen";
1483  else if (vtxcontent_optype(c2)==spoc_type_alu &&
1484  vtxcontent_inputs(c2)==NIL)
1485  result = -1, why = "gen";
1486  // ??? do inplace last
1487  // ??? or ONLY if there is a shared input?
1488  else if (a1->terapix.inplace && !a2->terapix.inplace)
1489  result = 1, why = "inplace";
1490  else if (!a1->terapix.inplace && a2->terapix.inplace)
1491  result = -1, why = "inplace";
1492  }
1493 
1494  // ??? priorise when an image is freed
1495 
1496  if (result==0 &&
1497  // is there an image output?
1502  {
1503  ifdebug(6) {
1504  dagvtx_dump(stderr, "v1", *v1);
1505  dagvtx_dump(stderr, "v2", *v2);
1506  }
1507  pips_assert("erosion is defined",
1508  hash_defined_p(erosion, NORTH(*v1)) &&
1509  hash_defined_p(erosion, NORTH(*v2)));
1510 
1511  // try to conclude with erosions:
1512  // not sure about the right partial order to use...
1513  int e1 = (int)
1514  ((_int) hash_get(erosion, NORTH(*v1)) +
1515  (_int) hash_get(erosion, SOUTH(*v1)) +
1516  (_int) hash_get(erosion, WEST(*v1)) +
1517  (_int) hash_get(erosion, EAST(*v1))),
1518  e2 = (int)
1519  ((_int) hash_get(erosion, NORTH(*v2)) +
1520  (_int) hash_get(erosion, SOUTH(*v2)) +
1521  (_int) hash_get(erosion, WEST(*v2)) +
1522  (_int) hash_get(erosion, EAST(*v2)));
1523 
1524  pips_debug(6, "e1=%d, e2=%d\n", e1, e2);
1525 
1526  if (e1!=e2)
1527  result = e1-e2, why = "erosion";
1528  }
1529 
1530  // ??? I should look at in place?
1531  // ??? I should look at the number live uses?
1532 
1533  if (result==0)
1534  {
1535  // if not set by previous case, use other criterions
1536  int
1537  l1 = (int) gen_length(vtxcontent_inputs(c1)),
1538  l2 = (int) gen_length(vtxcontent_inputs(c2));
1539 
1540  // count non mesure successors:
1541  int nms1 = 0, nms2 = 0;
1542 
1543  FOREACH(dagvtx, vs1, dagvtx_succs(*v1))
1544  if (dagvtx_optype(vs1)!=spoc_type_mes) nms1++;
1545 
1546  FOREACH(dagvtx, vs2, dagvtx_succs(*v2))
1547  if (dagvtx_optype(vs2)!=spoc_type_mes) nms2++;
1548 
1549  if (l1!=l2 && (l1==0 || l2==0))
1550  // put image generators at the end, after any other computation
1551  result = l2-l1, why = "args";
1552  else if (nms1!=nms2 && l1==1 && l2==1)
1553  // the less successors the better? the rational is:
1554  // - mesures are handled before and do not have successors anyway,
1555  // - so this is about whether a result of an unary op is reused by
1556  // two nodes, in which case it will just jam the pipeline, so
1557  // try to put other computations before it. Note that mes
1558  // successors do not really count, as the image is not lost.
1559  result = nms1 - nms2, why = "succs";
1560  else if (l1!=l2)
1561  // else ??? no effect on my validation.
1562  result = l2-l1, why = "args2";
1563  else if (vtxcontent_optype(c1)!=vtxcontent_optype(c2))
1564  // otherwise use the op types, which are somehow ordered
1565  // so that if all is well the pipe is filled in order.
1566  result = vtxcontent_optype(c1) - vtxcontent_optype(c2), why = "ops";
1567  else
1568  // if all else fails, rely on statement numbers.
1569  result = dagvtx_number(*v1) - dagvtx_number(*v2), why = "stats";
1570  }
1571 
1572  pips_debug(6, "%" _intFMT " %s %s %" _intFMT " %s (%s)\n",
1573  dagvtx_number(*v1), dagvtx_operation(*v1),
1574  result<0? ">": (result==0? "=": "<"),
1575  dagvtx_number(*v2), dagvtx_operation(*v2), why);
1576 
1577  pips_assert("total order", v1==v2 || result!=0);
1578  return result;
1579 }
1580 
1581 /* @brief whether vertex is not implemented in terapix
1582  */
1583 static bool not_implemented(dagvtx v)
1584 {
1585  if (freia_convolution_p(v)) // special case
1586  {
1587  // skip if parametric
1588  _int w, h;
1589  return !freia_convolution_width_height(v, &w, &h, false);
1590  }
1591  return !freia_aipo_terapix_implemented(dagvtx_freia_api(v));
1592 }
1593 
1594 /* @brief whether dag is not implemented in terapix
1595  */
1596 static bool terapix_not_implemented(dag d)
1597 {
1598  FOREACH(dagvtx, v, dag_vertices(d))
1599  if (not_implemented(v))
1600  return true;
1601  return false;
1602 }
1603 
1604 /* @brief choose a vertex, avoiding non combinable stuff if the list is started
1605  */
1606 static dagvtx choose_terapix_vertex(const list lv, bool started)
1607 {
1608  pips_assert("list contains vertices", lv);
1609  if (started)
1610  {
1611  FOREACH(dagvtx, v, lv)
1612  if (!not_implemented(v))
1613  return v;
1614  }
1615  // just return the first vertex
1616  return DAGVTX(CAR(lv));
1617 }
1618 
1619 /*********************************************************** TERAPIX DAG CUT */
1620 
1621 /* would it seem interesting to split d?
1622  * @return the erosion up to which to split, or 0 of no split
1623  * should we also/instead consider the expected cost?
1624  */
1625 static int cut_decision(dag d, hash_table erosion)
1626 {
1627  int com_cost_per_row = get_int_property(trpx_dmabw_prop);
1628  int width, cost, nops, n, s, w, e;
1629  (void)dag_terapix_measures(d, erosion, &width, &cost, &nops, &n, &s, &w, &e);
1630 
1631  // bye bye...
1632  if (width==0) return 0;
1633 
1634  int nins = gen_length(dag_inputs(d)), nouts = gen_length(dag_outputs(d));
1635 
1636  // if we assume that the imagelet size is quite large, say around 128
1637  // even with double buffers. The only reason to cut is because
1638  // of the erosion on the side which reduces the amount of valid data,
1639  // but there is really a point to do that only communications are still
1640  // masked by computations after splitting the dag...
1641 
1642  // first we compute a possible number of splits
1643  // computation cost = communication cost (in cycle per imagelet row)
1644  // communication cost = (nins + 2*width*n_splits + nouts) * cost_per_row
1645  // the width is taken as the expected number of images to extract and
1646  // reinject (hence 2*) if the dag is split.
1647  // this is really an approximation... indeed, nothing ensures that
1648  // the initial input is not still alive at the chosen cut?
1649 
1650  // for anr999 the gradient of depth 10 is just enough to cover the coms.
1651  // for lp, about 1(.2) split is suggested.
1652 
1653  // compute number of cuts, that is the number of amortizable load/store
1654  // ??? maybe I should incorporate a margin?
1655  double n_cuts;
1656 
1657  // please note that these formula are somehow approximated and the results
1658  // may be proved wrong.
1659  if (trpx_overlap_io_p())
1660  {
1661  // number of image to communicate is MAX(#in,#out)
1662  int nimgs = nins>nouts? nins: nouts;
1663  // the overhead of a cut is one transfer
1664  n_cuts = ((1.0*cost/com_cost_per_row)-nimgs)/(1.0*width);
1665  }
1666  else
1667  n_cuts = ((1.0*cost/com_cost_per_row)-nins-nouts)/(2.0*width);
1668 
1669  pips_debug(2, "cost=%d com_cost=%d nins=%d width=%d nouts=%d n_cuts=%f\n",
1670  cost, com_cost_per_row, nins, width, nouts, n_cuts);
1671 
1672  if (n_cuts < 1.0) return 0;
1673 
1674  // we also have to check that there is a significant erosion!
1675  // I first summarize the erosion to the max(n,s,e,w)
1676  // grrr... C really lacks a stupid max/min function varyadic!
1677  // I could compute per direction, if necessary...
1678  int erode = n;
1679  if (s>erode) erode=s;
1680  if (e>erode) erode=e;
1681  if (w>erode) erode=w;
1682 
1683  // then we should decide...
1684  // there should be enough computations to amortize a split,
1685  // given that an erode/dilate costs about 15 cycles per row
1686  // there should be about 2 of them to amortize/hide one imagelet transfer,
1687  // whether as input or output.
1688 
1689  int cut = erode/((int)(n_cuts+1));
1690 
1691  // try to fix the balance chosen by the integer division
1692  // hmmm... should really look at the weights to choose a side here...
1693  if (erode%2==1 && n_cuts<2.0 && nouts<=nins)
1694  cut++;
1695 
1696  return cut;
1697 }
1698 
1699 /* cut dag "d", possibly a subdag of "fulld", at "erosion" "cut"
1700  */
1701 static dag cut_perform(dag d, int cut, hash_table erodes, dag fulld,
1702  const set output_images)
1703 {
1704  pips_debug(2, "cutting with cut=%d\n", cut);
1705  pips_assert("something cut width", cut>0);
1706 
1707  set
1708  // current set of vertices to group
1709  current = set_make(set_pointer),
1710  // all vertices which are considered computed
1711  done = set_make(set_pointer);
1712 
1713  list lcurrent = NIL, computables;
1714  set_assign_list(done, dag_inputs(d));
1715 
1716  // GLOBAL
1717  pips_assert("erosion is clean", erosion==NULL);
1718  erosion = hash_table_make(hash_pointer, 0);
1719  dag_terapix_erosion(d, erosion);
1720 
1721  // transitive closure
1722  bool changed = true;
1723  while (changed &&
1724  (computables = dag_computable_vertices(d, done, done, current)))
1725  {
1726  // ensure determinism
1727  gen_sort_list(computables, (gen_cmp_func_t) dagvtx_terapix_priority);
1728  changed = false;
1729  FOREACH(dagvtx, v, computables)
1730  {
1731  // keep erosion up to cut
1732  // hmmm. what about \sigma_{d \in NSEW} erosion_d ?
1733  // would not work because the erosion only make sense if it is
1734  // the same for all imagelet, or said otherwise the erosion is
1735  // aligned to the worst case so that tiling can reasonnably take place.
1736  if ((((_int) hash_get(erodes, NORTH(v))) <= cut) &&
1737  (((_int) hash_get(erodes, SOUTH(v))) <= cut) &&
1738  (((_int) hash_get(erodes, EAST(v))) <= cut) &&
1739  (((_int) hash_get(erodes, WEST(v))) <= cut))
1740  {
1741  set_add_element(current, current, v);
1742  set_add_element(done, done, v);
1743  lcurrent = CONS(dagvtx, v, lcurrent);
1744  changed = true;
1745  }
1746  }
1747 
1748  // cleanup
1749  gen_free_list(computables), computables = NIL;
1750  }
1751 
1752  // cleanup GLOBAL
1753  hash_table_free(erosion), erosion = NULL;
1754 
1755  lcurrent = gen_nreverse(lcurrent);
1756  pips_assert("some vertices where extracted", lcurrent!=NIL);
1757 
1758  // build extracted dag
1759  dag nd = make_dag(NIL, NIL, NIL);
1760  FOREACH(dagvtx, v, lcurrent)
1761  {
1762  // pips_debug(7, "extracting node %" _intFMT "\n", dagvtx_number(v));
1763  dag_append_vertex(nd, copy_dagvtx_norec(v));
1764  }
1765  dag_compute_outputs(nd, NULL, output_images, NIL, false);
1766  dag_cleanup_other_statements(nd);
1767 
1768  // cleanup full dag
1769  FOREACH(dagvtx, v, lcurrent)
1770  dag_remove_vertex(d, v);
1771 
1772  // ??? should not be needed?
1773  freia_hack_fix_global_ins_outs(fulld, nd);
1774  freia_hack_fix_global_ins_outs(fulld, d);
1775 
1776  ifdebug(1)
1777  {
1778  dag_consistency_asserts(nd);
1779  dag_consistency_asserts(d);
1780  }
1781 
1782  // cleanup
1783  gen_free_list(lcurrent), lcurrent = NIL;
1784  set_free(done);
1785  set_free(current);
1786  return nd;
1787 }
1788 
1789 /*************************************************** TERAPIX HANDLE SEQUENCE */
1790 
1791 static void migrate_statements(sequence sq, dag d, set dones)
1792 {
1793  set stats = set_make(set_pointer);
1794  dag_statements(stats, d);
1795  freia_migrate_statements(sq, stats, dones);
1796  set_union(dones, dones, stats);
1797  set_free(stats);
1798 }
1799 
1800 /* do compile a list of statements for terapix
1801  * @param module, current module (function) name
1802  * @param ls, list of statements taken from the sequence
1803  * @param occs, occurrences of images (image -> set of statements)
1804  * @param helper_file, file to which code is to be generated
1805  * @param number, number of this statement sequence in module
1806  * @return list of intermediate image to allocate
1807  */
1808 list freia_trpx_compile_calls
1809 (string module,
1810  dag fulld,
1811  sequence sq,
1812  list /* of statements */ ls,
1813  const hash_table occs,
1814  hash_table exchanges,
1815  const set output_images,
1816  FILE * helper_file,
1817  set helpers,
1818  int number)
1819 {
1820  bool reduce_cc =
1821  get_bool_property("HWAC_TERAPIX_REDUCE_TO_CONNECTED_COMPONENTS");
1822 
1823  // build DAG for ls
1824  pips_debug(3, "considering %d statements\n", (int) gen_length(ls));
1825  pips_assert("some statements", ls);
1826 
1827  int n_op_init, n_op_init_copies;
1828  freia_aipo_count(fulld, &n_op_init, &n_op_init_copies);
1829 
1830  // must have distinct images in the graph for optimizations
1831  hash_table init = hash_table_make(hash_pointer, 0);
1832  list new_images = dag_fix_image_reuse(fulld, init, occs);
1833 
1834  list added_before = NIL, added_after = NIL;
1835  freia_dag_optimize(fulld, exchanges, &added_before, &added_after);
1836 
1837  int n_op_opt, n_op_opt_copies;
1838  freia_aipo_count(fulld, &n_op_opt, &n_op_opt_copies);
1839 
1840  fprintf(helper_file,
1841  "\n"
1842  "// dag %d: %d ops and %d copies, "
1843  "optimized to %d ops and %d+%d+%d copies\n",
1844  number, n_op_init, n_op_init_copies,
1845  n_op_opt, n_op_opt_copies,
1846  (int) gen_length(added_before), (int) gen_length(added_after));
1847 
1848  // dump final dag
1849  dag_dot_dump_prefix(module, "dag_cleaned_", number, fulld,
1850  added_before, added_after);
1851 
1852  string fname_fulldag = strdup(cat(module, "_terapix", HELPER, i2a(number)));
1853 
1854  // First, split only on scalar deps...
1855  // is it that simple? NO!
1856  // consider A -> B -> s -> C -> D
1857  // \-> E -> F />
1858  // then ABEF / CD is chosen
1859  // although ABE / FCD and AB / EFCD would be also possible..
1860 
1861  pips_assert("erosion is clean", erosion==NULL);
1863  list ld = dag_split_on_scalars(fulld,
1868  output_images);
1869  hash_table_free(erosion), erosion = NULL;
1870 
1871  // split ld dags by connected components
1872  if (reduce_cc)
1873  {
1874  list nld = NIL;
1875  FOREACH(dag, d, ld)
1876  nld = gen_nconc(nld, dag_split_connected_components(d, output_images));
1877  gen_free_list(ld), ld = nld, nld = NIL;
1878  }
1879 
1880  pips_debug(4, "dag initial split in %d dags\n", (int) gen_length(ld));
1881 
1882  const char* dag_cut = get_string_property(trpx_dag_cut);
1883  pips_assert("valid cutting strategy", trpx_dag_cut_is_valid(dag_cut));
1884 
1885  // globally remaining statements
1886  set global_remainings = set_make(set_pointer);
1887  set_assign_list(global_remainings, ls);
1888 
1889  int n_split = 0;
1890  int stnb = -1;
1891  set dones = set_make(set_pointer);
1892 
1893  FOREACH(dag, d, ld)
1894  {
1895  // ??? should migrate beforehand?
1896 
1897  // skip if something is not implemented
1898  if (terapix_not_implemented(d))
1899  continue;
1900 
1901  if (dag_no_image_operation(d))
1902  continue;
1903 
1904  if (trpx_dag_cut_none_p(dag_cut))
1905  {
1906  migrate_statements(sq, d, dones);
1907  // direct handling of the dag
1908  stnb = freia_trpx_compile_one_dag(module, ls, d, fname_fulldag, n_split,
1909  -1, global_remainings, helper_file, helpers, stnb, init);
1910  }
1911  else if (trpx_dag_cut_compute_p(dag_cut))
1912  {
1913  // try split dag into subdags with a rough computed strategy
1915  int cut, n_cut = 0;
1916 
1917  // what about another strategy?
1918  // I can try every possible cuts and chose the best one,
1919  // that is to stop as soon as computation cost > communication cost?
1920  // or when costs are quite balanced in all cuts?
1921  // dag cutting strategy prop = none/computed/optimized?
1922 
1923  while ((cut = cut_decision(d, erosion)))
1924  {
1925  dag dc = cut_perform(d, cut, erosion, fulld, output_images);
1926 
1927  // may separate by connected components...
1928  list ld;
1929  if (reduce_cc)
1930  ld = dag_split_connected_components(dc, output_images);
1931  else
1932  ld = CONS(dag, dc, NIL);
1933 
1934  FOREACH(dag, dci, ld)
1935  {
1936  migrate_statements(sq, dci, dones);
1937  // generate code for cut
1938  stnb =
1939  freia_trpx_compile_one_dag(module, ls, dci, fname_fulldag, n_split,
1940  n_cut++, global_remainings, helper_file, helpers, stnb, init);
1941  // cleanup
1942  free_dag(dci);
1943  }
1944 
1946  gen_free_list(ld);
1947  }
1948 
1949  if (dag_vertices(d)) {
1950  // should it *ALWAYS* HAPPEN?
1951  migrate_statements(sq, d, dones);
1952  stnb = freia_trpx_compile_one_dag(module, ls, d, fname_fulldag, n_split,
1953  n_cut++, global_remainings, helper_file, helpers, stnb, init);
1954  }
1955 
1957  }
1958  else if (trpx_dag_cut_enumerate_p(dag_cut))
1959  pips_internal_error("not implemented yet");
1960  else
1961  pips_internal_error("cannot get there");
1962 
1963  n_split++;
1964  }
1965 
1966  freia_insert_added_stats(ls, added_before, true);
1967  added_before = NIL;
1968  freia_insert_added_stats(ls, added_after, false);
1969  added_after = NIL;
1970 
1971  // full cleanup
1972  set_free(global_remainings), global_remainings = NULL;
1973  free(fname_fulldag), fname_fulldag = NULL;
1974  FOREACH(dag, dc, ld)
1975  free_dag(dc);
1976  gen_free_list(ld);
1977 
1978  // deal with new images
1979  list real_new_images =
1980  freia_allocate_new_images_if_needed(ls, new_images, occs, init, init);
1981  gen_free_list(new_images);
1983  return real_new_images;
1984 }
static void node(FILE *out, string name)
Build for module name a node and link to its successors.
Definition: graph.c:56
int get_int_property(const string)
void free_dag(dag p)
void free_dagvtx(dagvtx p)
struct paramStruct params
static reference ref
Current stmt (an integer)
Definition: adg_read_paf.c:163
static FILE * out
Definition: alias_check.c:128
void const char const char const int
@ INT
Definition: atomic.c:48
bdt base
Current expression.
Definition: bdt_read_paf.c:100
void dag_consistency_asserts(dag d)
do some consistency checking...
Definition: dag-utils.c:531
list dag_split_connected_components(dag d, set output_images)
build connected components
Definition: dag-utils.c:3035
_int dagvtx_optype(const dagvtx v)
Definition: dag-utils.c:116
list dag_vertex_preds(const dag d, const dagvtx target)
return target predecessor vertices as a list.
Definition: dag-utils.c:680
_int dagvtx_number(const dagvtx v)
returns the vertex number, i.e.
Definition: dag-utils.c:98
bool dagvtx_other_stuff_p(const dagvtx v)
a vertex with a non AIPO or image related statement.
Definition: dag-utils.c:76
bool dag_no_image_operation(dag d)
tell whether we have something to do with images ??? hmmm...
Definition: dag-utils.c:2500
list dag_split_on_scalars(const dag initial, bool(*alone_only)(const dagvtx), dagvtx(*choose_vertex)(const list, bool), gen_cmp_func_t priority, void(*priority_update)(const dag), const set output_images)
split a dag on scalar dependencies only, with a greedy heuristics.
Definition: dag-utils.c:2823
void dagvtx_dump(FILE *out, const string name, const dagvtx v)
for dag debug.
Definition: dag-utils.c:186
list dag_computable_vertices(dag d, const set computed, const set maybe, const set currents)
return the vertices which may be computed from the list of available images, excluding vertices in ex...
Definition: dag-utils.c:2307
dagvtx dagvtx_get_producer(const dag d, const dagvtx sink, const entity e, _int before_number)
return (last) producer of image e for vertex sink, or NULL if none found.
Definition: dag-utils.c:156
void dag_remove_vertex(dag d, const dagvtx v)
remove vertex v from dag d.
Definition: dag-utils.c:570
void dag_dump(FILE *out, const string what, const dag d)
for dag debug
Definition: dag-utils.c:212
void dag_dot_dump(const string module, const string name, const dag d, const list lb, const list la)
generate a "dot" format from a dag to a file.
Definition: dag-utils.c:488
void set_append_vertex_statements(set s, list lv)
Definition: dag-utils.c:2385
string dagvtx_operation(const dagvtx v)
Definition: dag-utils.c:134
_int dagvtx_opid(const dagvtx v)
Definition: dag-utils.c:121
void dag_dot_dump_prefix(const string module, const string prefix, int number, const dag d, const list lb, const list la)
Definition: dag-utils.c:504
char * get_string_property(const char *)
const freia_api_t * hwac_freia_api(const char *function)
freia-utils.c
Definition: freia-utils.c:455
void freia_add_image_arguments(list limg, list *lparams)
prepend limg images in front of the argument list; limg is consumed by the operation.
Definition: freia-utils.c:1234
list freia_get_vertex_params(const dagvtx v)
Definition: freia-utils.c:578
list freia_extract_params(const int napi, list args, string_buffer head, string_buffer head2, hash_table params, int *nparams)
returns an allocated expression list of the parameters only (i.e.
Definition: freia-utils.c:613
void hwac_kill_statement(statement s)
remove contents of statement s.
Definition: freia-utils.c:761
list freia_allocate_new_images_if_needed(list ls, list images, const hash_table occs, const hash_table init, const hash_table signatures)
insert image allocation if needed, for intermediate image inserted before if an image is used only tw...
Definition: freia-utils.c:1650
int freia_substitute_by_helper_call(dag d, set global_remainings, set remainings, list ls, const string function_name, list lparams, set helpers, int preceeding)
substitute those statements in ls that are in dag d and accelerated by a call to function_name(lparams...
Definition: freia-utils.c:1073
bool freia_convolution_p(dagvtx v)
is it the convolution special case?
Definition: freia-utils.c:1441
const freia_api_t * get_freia_api(int index)
Definition: freia-utils.c:477
bool freia_extract_kernel_vtx(dagvtx v, bool strict, intptr_t *k00, intptr_t *k10, intptr_t *k20, intptr_t *k01, intptr_t *k11, intptr_t *k21, intptr_t *k02, intptr_t *k12, intptr_t *k22)
vertex-based version
Definition: freia-utils.c:2012
call freia_statement_to_call(const statement s)
return the actual function call from a statement, dealing with assign and returns....
Definition: freia-utils.c:973
bool freia_convolution_width_height(dagvtx v, _int *pw, _int *ph, bool check)
get width & height of convolution
Definition: freia-utils.c:1449
void freia_insert_added_stats(list ls, list stats, bool before)
insert statements into the actual code sequence in "ls". BEWARE that ls is assumed to be in reverse order....
Definition: freia-utils.c:1185
bool freia_aipo_terapix_implemented(const freia_api_t *api)
whether api available with Ter@pix
Definition: freia-utils.c:1426
#define cat(args...)
Definition: freia.h:41
#define AIPO
Definition: freia.h:51
#define HELPER
Definition: freia.h:38
#define sb_cat(args...)
Definition: freia.h:42
#define FREIA_IMAGE
Definition: freia.h:52
#define FREIA_DEFAULT_HEIGHT
Definition: freia.h:54
#define dagvtx_freia_api(v)
Definition: freia.h:97
@ spoc_type_mes
Definition: freia_spoc.h:179
@ spoc_type_nop
Definition: freia_spoc.h:174
@ spoc_type_oth
Definition: freia_spoc.h:173
@ spoc_type_alu
Definition: freia_spoc.h:177
#define pstatement_statement_p(x)
#define dagvtx_content(x)
#define vtxcontent_optype(x)
#define dag_outputs(x)
#define vtxcontent_out(x)
#define pstatement_statement(x)
#define dag_inputs(x)
#define dagvtx_succs(x)
#define vtxcontent_inputs(x)
#define dag_vertices(x)
#define vtxcontent_source(x)
static void terapix_mcu_img(string_buffer code, int op, string ref, int n)
set a double buffered image argument.
static hash_table erosion
global variable used by the dagvtx_terapix_priority function, because qsort does not allow to pass so...
static void dag_terapix_reset_erosion(const dag d)
#define WEST(v)
Definition: freia_terapix.c:88
static void terapix_gram_allocate(bool *used, int width, int height, int *x, int *y)
terapix allocate widthxheight in global memory
#define RED_PTR
static int cut_decision(dag d, hash_table erosion)
would it seem interesting to split d?
#define ip2s(n)
static void terapix_init_row(string_buffer decl, string_buffer code, string base, string suff, string mem, int nrow, string val, bool *used)
initialize a few rows at mem address with value val
static void dag_terapix_erosion(const dag d, hash_table erosion)
fill in erosion hash table from dag d.
static int dag_terapix_measures(const dag d, hash_table erosion, int *width, int *cost, int *nops, int *north, int *south, int *west, int *east)
compute some measures about DAG d.
static bool not_implemented(dagvtx v)
whether vertex is not implemented in terapix
static int freia_trpx_compile_one_dag(string module, list ls, dag d, string fname_fulldag, int n_split, int n_cut, set global_remainings, FILE *helper_file, set helpers, int stnb, hash_table signatures)
generate terapix code for this one dag, which should be already split.
#define EAST(v)
Definition: freia_terapix.c:89
static void erosion_optimization(dagvtx v, bool *north, bool *south, bool *west, bool *east)
tell whether the kernel is used on each of the 4 directions.
Definition: freia_terapix.c:69
static void terapix_get_reduction(string_buffer decl, string_buffer tail, int n_op, string mem, const freia_api_t *api)
generate reduction extraction code
static void terapix_macro_code(string_buffer code, string_buffer decl, int op, const freia_api_t *api, bool *used, hash_table hparams, const dagvtx v, const list ins, int out)
generate terapix code for
static void gram_param(string_buffer code, string_buffer decl, string name, dagvtx v, hash_table hparams, int width, int height, bool is_kernel, bool *used)
copy some operator parameters in the global ram (aka gram).
static _int freia_terapix_call(const string module, const string fname_dag, string_buffer code, dag thedag, list *params)
generate a terapix call for dag thedag.
static void terapix_initialize_memory(string_buffer decl, string_buffer body, int nop, string mem, const freia_api_t *api, bool *used)
initialize the memory at addr depending on the operation to perform
static dag cut_perform(dag d, int cut, hash_table erodes, dag fulld, const set output_images)
cut dag "d", possibly a subdag of "fulld", at "erosion" "cut"
#define IMG_PTR
static void terapix_image(string_buffer sb, int ff, int n)
generate an image symbolic pointer (a name:-).
static bool * terapix_gram_init(void)
allocate bitfield to describe used cells in global memory.
static bool terapix_not_implemented(dag d)
whether dag is not implemented in terapix
static list dag_vertex_pred_imagelets(const dag d, const dagvtx v, const hash_table allocation)
of ints
static void terapix_gram_management(string_buffer code, string_buffer decl, int op, const freia_api_t *api, const dagvtx v, hash_table hparams, bool *used)
manage GRAM global memory to pass parameters.
static void terapix_mcu_pval(string_buffer code, int op, string ref, string p, string s)
set some prefixed value string argument.
static int dagvtx_terapix_priority(const dagvtx *v1, const dagvtx *v2)
comparison function for sorting dagvtx in qsort, this is deep voodoo, because the priority has an imp...
static void compute_dead_vertices(set deads, const set computed, const dag d, const dagvtx v)
Definition: freia_terapix.c:53
#define SOUTH(v)
Definition: freia_terapix.c:87
#define NORTH(v)
Definition: freia_terapix.c:86
static void update_erosions(const dag d, const dagvtx v, hash_table erosion)
update_erosions().
Definition: freia_terapix.c:95
static dagvtx choose_terapix_vertex(const list lv, bool started)
choose a vertex, avoiding non combinable stuff if the list is started
static _int select_imagelet(set availables, int *nimgs, bool first)
Return the first/last available imagelet, or create one if necessary. This ensures that the choice is ...
static void terapix_mcu_val(string_buffer code, int op, string r, string s)
set some value string argument.
static void migrate_statements(sequence sq, dag d, set dones)
static void terapix_mcu_int(string_buffer code, int op, string ref, int val)
set an integer argument.
#define trpx_dag_cut_none_p(s)
Definition: freia_terapix.h:43
#define trpx_overlap_io_p()
Definition: freia_terapix.h:51
#define trpx_gram_width
Definition: freia_terapix.h:36
#define trpx_npe_prop
Definition: freia_terapix.h:34
#define trpx_dag_cut_is_valid(s)
Definition: freia_terapix.h:46
#define trpx_gram_height
Definition: freia_terapix.h:37
#define trpx_max_size
Definition: freia_terapix.h:40
#define trpx_mem_prop
Definition: freia_terapix.h:33
#define trpx_dag_cut
Definition: freia_terapix.h:38
#define trpx_dmabw_prop
Definition: freia_terapix.h:35
#define trpx_dag_cut_compute_p(s)
Definition: freia_terapix.h:44
#define trpx_dag_cut_enumerate_p(s)
Definition: freia_terapix.h:45
void * malloc(YYSIZE_T)
void free(void *)
list gen_nreverse(list cp)
reverse a list in place
Definition: list.c:304
#define NIL
The empty list (nil in Lisp)
Definition: newgen_list.h:47
list gen_copy_seq(list l)
Copy a list structure.
Definition: list.c:501
size_t gen_length(const list l)
Definition: list.c:150
#define CONS(_t_, _i_, _l_)
List element cell constructor (insert an element at the beginning of a list)
Definition: newgen_list.h:150
list gen_nconc(list cp1, list cp2)
physically concatenates CP1 and CP2 but does not duplicate the elements
Definition: list.c:344
#define CAR(pcons)
Get the value of the first element of a list.
Definition: newgen_list.h:92
void gen_free_list(list l)
free the spine of the list
Definition: list.c:327
bool gen_in_list_p(const void *vo, const list lx)
tell whether vo belongs to lx
Definition: list.c:734
#define FOREACH(_fe_CASTER, _fe_item, _fe_list)
Apply/map an instruction block on all the elements of a list.
Definition: newgen_list.h:179
#define CDR(pcons)
Get the list less its first element.
Definition: newgen_list.h:111
hash_table hash_table_make(hash_key_type key_type, size_t size)
Definition: hash.c:294
void * hash_get(const hash_table htp, const void *key)
this function retrieves from the hash table pointed to by htp the couple whose key is equal to key.
Definition: hash.c:449
void hash_put(hash_table htp, const void *key, const void *val)
This function stores a couple (key,val) in the hash table pointed to by htp.
Definition: hash.c:364
void hash_table_free(hash_table htp)
this function deletes a hash table that is no longer useful.
Definition: hash.c:327
bool hash_defined_p(const hash_table htp, const void *key)
true if key has a value in htp.
Definition: hash.c:484
void hash_table_clear(hash_table htp)
Clears all entries of a hash table HTP.
Definition: hash.c:305
float_t space[SIZE][SIZE]
Definition: jacobi.c:7
int allocation
External variables for direct call to PIP.
Definition: pip.c:92
#define pips_debug
these macros use the GNU extensions that allow variadic macros, including with an empty list.
Definition: misc-local.h:145
#define pips_assert(what, predicate)
common macros, two flavors depending on NDEBUG
Definition: misc-local.h:172
#define pips_internal_error
Definition: misc-local.h:149
char * i2a(int)
I2A (Integer TO Ascii) yields a string for a given Integer.
Definition: string.c:121
string bool_to_string(bool)
Definition: string.c:243
@ hash_pointer
Definition: newgen_hash.h:32
#define same_string_p(s1, s2)
bool set_empty_p(const set)
tell whether set s is empty.
Definition: set.c:367
set set_assign_list(set, const list)
assigns a list's contents to a set; all duplicated elements are lost
Definition: set.c:474
set set_del_element(set, const set, const void *)
Definition: set.c:265
bool list_in_set_p(const list, const set)
Definition: set.c:201
#define SET_FOREACH(type_name, the_item, the_set)
enumerate set elements in their internal order.
Definition: newgen_set.h:78
void set_free(set)
Definition: set.c:332
set set_clear(set)
Assign the empty set to s: s := {}.
Definition: set.c:326
bool set_belong_p(const set, const void *)
Definition: set.c:194
void set_fprint(FILE *, string, const set, gen_string_func_t)
print set s to file stream out.
Definition: set.c:524
@ set_pointer
Definition: newgen_set.h:44
set set_append_list(set, const list)
add list l items to set s, which is returned.
Definition: set.c:460
set set_dup(const set)
Definition: set.c:143
set set_make(set_type)
Create an empty set of any type but hash_private.
Definition: set.c:102
set set_add_element(set, const set, const void *)
Definition: set.c:152
void string_buffer_append_sb(string_buffer, const string_buffer)
append the string buffer sb2 to string buffer sb.
size_t string_buffer_size(const string_buffer)
return the size of the string in string_buffer sb
void string_buffer_to_file(const string_buffer, FILE *)
put string buffer into file.
void string_buffer_free(string_buffer *)
free string buffer structure, also free string contents according to the dup field
Definition: string_buffer.c:82
string_buffer string_buffer_make(bool dup)
allocate a new string buffer
Definition: string_buffer.c:58
#define _intFMT
Definition: newgen_types.h:57
char * string
STRING.
Definition: newgen_types.h:39
intptr_t _int
_INT
Definition: newgen_types.h:53
int(* gen_cmp_func_t)(const void *, const void *)
Definition: newgen_types.h:114
int f(int off1, int off2, int n, float r[n], float a[n], float b[n])
Definition: offsets.c:15
static char * module
Definition: pips.c:74
list lparams
Array bounds.
Definition: reindexing.c:111
const char * entity_user_name(entity e)
Since entity_local_name may contain PIPS special characters such as prefixes (label,...
Definition: entity.c:487
entity local_name_to_top_level_entity(const char *n)
This function tries to find a top-level entity from a local name.
Definition: entity.c:1450
static int init
Maximal value set for Fortran 77.
Definition: entity.c:320
#define EXPRESSION(x)
EXPRESSION.
Definition: ri.h:1217
#define call_arguments(x)
Definition: ri.h:711
char * strdup()
#define ifdebug(n)
Definition: sg.c:47
static bool ok
static char * x
Definition: split_file.c:159
static void initialize()
Definition: stats.c:407
#define intptr_t
Definition: stdint.in.h:294
static size_t current
Definition: string.c:115
internally defined structure.
Definition: string_buffer.c:47
FI: I do not understand why the type is duplicated at the set level.
Definition: set.c:59
The structure used to build lists in NewGen.
Definition: newgen_list.h:41
FREIA API function name -> SPoC hardware description (and others?)
Definition: freia.h:71
unsigned int arg_misc_in
Definition: freia.h:83
string compact_name
Definition: freia.h:75
terapix_hw_t terapix
Definition: freia.h:90
string arg_out_types[3]
Definition: freia.h:85
unsigned int arg_img_out
Definition: freia.h:79
unsigned int arg_misc_out
Definition: freia.h:82
unsigned int arg_img_in
Definition: freia.h:80
Definition: statement.c:4047
static Panel_item choice
Definition: xv_schoose2.c:54