PIPS
freia_opencl.c
Go to the documentation of this file.
1 /*
2 
3  $Id: freia_opencl.c 23288 2016-11-08 10:09:39Z coelho $
4 
5  Copyright 1989-2016 MINES ParisTech
6 
7  This file is part of PIPS.
8 
9  PIPS is free software: you can redistribute it and/or modify it
10  under the terms of the GNU General Public License as published by
11  the Free Software Foundation, either version 3 of the License, or
12  any later version.
13 
14  PIPS is distributed in the hope that it will be useful, but WITHOUT ANY
15  WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  FITNESS FOR A PARTICULAR PURPOSE.
17 
18  See the GNU General Public License for more details.
19 
20  You should have received a copy of the GNU General Public License
21  along with PIPS. If not, see <http://www.gnu.org/licenses/>.
22 
23 */
24 
25 #ifdef HAVE_CONFIG_H
26 #include "pips_config.h"
27 #endif
28 
29 #include <stdint.h>
30 #include <stdlib.h>
31 
32 #include "genC.h"
33 #include "misc.h"
34 #include "linear.h"
35 
36 #include "ri.h"
37 #include "ri-util.h"
38 
39 #include "pipsdbm.h"
40 #include "properties.h"
41 
42 #include "freia.h"
43 #include "hwac.h"
44 
45 /* @return OpenCL helper file name for function
46  */
47 static string get_opencl_file_name(string func_name)
48 {
49  string src_dir = db_get_directory_name_for_module(func_name);
50  string file = strdup(cat(src_dir, "/", func_name, "_helper_functions.cl"));
51  free(src_dir);
52  return file;
53 }
54 
55 /* @return whether this vertex is mergeable for OpenCL
56  */
57 static bool opencl_mergeable_p(const dagvtx v)
58 {
59  const freia_api_t * api = dagvtx_freia_api(v);
60  bool mergeable = api? api->opencl.mergeable: false;
61  pips_debug(7, "%"_intFMT" vertex is %smergeable\n",
62  dagvtx_number(v), mergeable? "": "not ");
63  return mergeable;
64 }
65 
66 /* qsort helper: return -1 for v1 before v2
67  */
68 static int dagvtx_opencl_priority(const dagvtx * pv1, const dagvtx * pv2)
69 {
70  const dagvtx v1 = *pv1, v2 = *pv2;
71  bool m1 = opencl_mergeable_p(v1), m2 = opencl_mergeable_p(v2);
72  if (m1 && !m2)
73  return -1;
74  else if (!m1 && m2)
75  return 1;
76  else
77  return dagvtx_number(v1)-dagvtx_number(v2);
78 }
79 
80 /* @brief choose a vertex, avoiding other stuff if the list is started
81  */
82 static dagvtx choose_opencl_vertex(const list lv, bool started)
83 {
84  pips_assert("list contains vertices", lv);
85  if (started)
86  {
87  FOREACH(dagvtx, v, lv)
88  if (!dagvtx_other_stuff_p(v))
89  return v;
90  }
91  // just return the first vertex
92  return DAGVTX(CAR(lv));
93 }
94 
95 /* @return opencl type for freia type
96  */
97 static string opencl_type(string t)
98 {
99  if (same_string_p(t, "int32_t")) return "int";
100  pips_internal_error("unexpected type: %s\n", t);
101  return NULL;
102 }
103 
104 static string border_condition[9] = {
105  "is_N|is_W", "is_N", "is_N|is_E",
106  "is_W", NULL, "is_E",
107  "is_S|is_W", "is_S", "is_S|is_E"
108 };
109 
110 /* generate a load if needed for an input variable
111  * return the holding variable name in a statically allocated array
112  */
113 static string pixel_name(
114  dagvtx v, // source vertex
115  int shft, // -4 -3 -2 / -1 0 1 / 2 3 4
116  set loaded, // those already loaded
117  string_buffer load, // append load code there if needed
118  list inputs, // list of input vertices
119  string indentation)
120 {
121  // build name
122  static char name[30];
123  int in = -1; // input number
124  static string suffix[9] =
125  { "NW", "N", "NE", "W", "", "E", "SW", "S", "SE" };
126  bool is_input = dagvtx_number(v)==0;
127  if (is_input) {
128  in = gen_position(v, inputs)-1;
129  sprintf(name, "in%d%s", in, suffix[shft+4]);
130  }
131  else // else temporary variable
132  sprintf(name, "t%d", (int) dagvtx_number(v));
133 
134  // generate code if needed
135  static string shift[9] = {
136  "-pitch-1", "-pitch", "-pitch+1",
137  "-1", "", "+1",
138  "+pitch-1", "+pitch", "+pitch+1"
139  };
140 
141 #define VARSHIFT(v,n) (((char*)v)+(n))
142 
143  if (is_input && !set_belong_p(loaded, VARSHIFT(v, shft)))
144  {
145  if (get_bool_property("HWAC_OPENCL_PRELOAD_PIXELS"))
146  {
147  // we declare and load
148  sb_cat(load, indentation, OPENCL_PIXEL, name, " = ");
149  if (border_condition[shft+4])
150  sb_cat(load, "(", border_condition[shft+4], ")? 0: ");
151  sb_cat(load, "j", i2a(in), "[i", shift[shft+4], "];\n");
152 
153  // done!
154  set_add_element(loaded, loaded, VARSHIFT(v, shft));
155  }
156  else
157  // directly reference the initial array
158  sprintf(name, "j%d[i%s]", in, shift[shft+4]);
159  }
160 
161  return name;
162 }
163 
164 /* @brief perform OpenCL compilation on mergeable dag
165  the generated code relies on some freia-provided runtime
166  may be called on a one vertex dag for kernel operations
167 */
169  string module,
170  dag d, list ls,
171  string split_name, int n_cut,
172  set global_remainings, hash_table signatures,
173  FILE * helper_file, FILE * opencl_file, set helpers, int stnb)
174 {
175  string cut_name = strdup(cat(split_name, "_", i2a(n_cut)));
176  pips_debug(3, "compiling %s cut %d, %d stats\n",
177  split_name, n_cut, (int) gen_length(dag_vertices(d)));
178  ifdebug(4) dag_dump(stderr, cut_name, d);
179 
180  // properties
181  bool tiling = get_bool_property("HWAC_OPENCL_TILING");
182  string BODY = tiling? " ": " "; // loop body indentations
183 
184  const string first_thread_dim = (const string)
185  get_string_property("HWAC_OPENCL_FIRST_THREAD_DIMENSION");
186  const string first_kernel_loop = (const string)
187  get_string_property("HWAC_OPENCL_FIRST_KERNEL_LOOP");
188  pips_assert("HWAC_OPENCL_FIRST_THREAD_DIMENSION prop is 'height' or 'width'",
189  same_string_p(first_thread_dim, "height") ||
190  same_string_p(first_thread_dim, "width"));
191  pips_assert("HWAC_OPENCL_FIRST_KERNEL_LOOP prop is 'height' or 'width'",
192  same_string_p(first_kernel_loop, "height") ||
193  same_string_p(first_kernel_loop, "width"));
194 
195  string Hdim, Wdim;
196  if (same_string_p(first_thread_dim, "height"))
197  Hdim = "0", Wdim = "1";
198  else
199  Hdim = "1", Wdim = "0";
200 
201  bool first_loop_on_height = same_string_p(first_kernel_loop, "height");
202 
203  // i is the width loop, j the height loop
204  string INDi, INDj;
205  if (tiling)
206  if (first_loop_on_height)
207  INDi = " ", INDj = " ";
208  else
209  INDi = " ", INDj = " ";
210  else // no tiling, no actual j loop.
211  INDi = " ", INDj = " ", first_loop_on_height = true;
212 
213  dag_dot_dump(module, cut_name, d, NIL, NIL);
214 
215  // I could handle a closed dag, such as volume(cst(12))...
216  pips_assert("some input or output images", dag_inputs(d) || dag_outputs(d));
217 
218  set remainings = set_make(set_pointer), loaded = set_make(set_pointer);
220 
221  list lparams = NIL;
222 
224  // helper function
225  helper = string_buffer_make(true),
226  helper_decls = string_buffer_make(true),
227  helper_body = string_buffer_make(true),
228  helper_body_2 = string_buffer_make(true),
229  helper_tail = string_buffer_make(true),
230  // opencl function
231  opencl = string_buffer_make(true), // function signature
232  opencl_2 = string_buffer_make(true), // last "reduction" argument
233  opencl_head = string_buffer_make(true), // setup & declarations & loop
234  opencl_i_loop = string_buffer_make(true),
235  opencl_j_loop = string_buffer_make(true),
236  opencl_pointers = string_buffer_make(true), // compute row image pointers
237  // body
238  opencl_load = string_buffer_make(true), // load variables
239  opencl_body = string_buffer_make(true), // actual operations
240  opencl_tail = string_buffer_make(true), // store variables
241  // end of code
242  opencl_end = string_buffer_make(true), // after loops
243  // compilation function
244  compile = string_buffer_make(true);
245 
246  // runtime temporary limitation: one image is reduced
247  entity reduced = NULL;
248 
249  // whether there is a kernel computation in dag
250  bool has_kernel = false,
251  need_N = false, need_S = false, need_W = false, need_E = false;
252 
253  sb_cat(helper,
254  "\n"
255  "// helper function ", cut_name, "\n"
256  "freia_status ", cut_name, "(");
257 
258  sb_cat(helper_decls,
259  " freia_status err = FREIA_OK;\n");
260 
261  int n_outs = gen_length(dag_outputs(d)), n_ins = gen_length(dag_inputs(d));
262 
263  // ??? what about vol(cst(3))? we could use default bpp/width/height?
264  pips_assert("some images to process", n_ins+n_outs);
265 
266  sb_cat(helper_decls,
267  "\n"
268  // hmmm... should be really done at init time. how to do that?
269  " // handle on the fly compilation...\n"
270  " static int to_compile = 1;\n"
271  " if (to_compile) {\n"
272  " err |= ", cut_name, "_compile();\n"
273  " // compilation may have failed\n"
274  " if (err) return err;\n"
275  " to_compile = 0;\n"
276  " }\n"
277  "\n"
278  " // now get kernel, which must be have be compiled\n"
279  " uint32_t bpp = ", n_ins? "i0": "o0", "->bpp>>4;\n"
280  " cl_kernel kernel = ", cut_name, "_kernel[bpp];\n");
281 
282  sb_cat(opencl,
283  "\n"
284  "// opencl function ", cut_name, "\n"
285  "KERNEL void ", cut_name, "(");
286 
287  // count stuff in the generated code
288  int nargs = 0, n_params = 0, n_misc = 0, cl_args = 1;
289 
290  if (!tiling)
291  sb_cat(opencl_head,
292  // we assume that the image height is on the first thread dimension
293  " // no tiling on height dimension\n"
294  " // assert(height==get_global_size(", Hdim, "));\n"
295  " int j = get_global_id(", Hdim, ");\n"
296  "\n");
297 
298  sb_cat(opencl_pointers,
299  tiling? INDj: "", " // get input & output image pointers\n",
300  tiling? INDj: "", " int shift = pitch*j;\n");
301 
302  // output images
303  if (n_outs)
304  sb_cat(opencl_tail, BODY, "// set output pixels\n");
305  int i = 0;
306  FOREACH(dagvtx, v, dag_outputs(d))
307  {
308  string si = strdup(i2a(i));
309  sb_cat(helper, nargs? ",": "", "\n " FREIA_IMAGE "o", si);
310  sb_cat(opencl, nargs? ",": "",
311  "\n " OPENCL_IMAGE "o", si,
312  ",\n int ofs_o", si);
313  // image p<out> = o<out> + ofs_o<out>;
314  sb_cat(opencl_pointers, tiling? INDj: "",
315  " " OPENCL_IMAGE "p", si, " = ",
316  "o", si, " + ofs_o", si, " + shift;\n");
317  // , o<out>
318  sb_cat(helper_body, ", o", si);
319  // p<out>[i] = t<n>;
320  sb_cat(opencl_tail,
321  BODY, "p", si, "[i] = t", i2a((int) dagvtx_number(v)), ";\n");
322  cl_args+=2;
323  nargs++;
324  free(si);
325  i++;
326  }
327 
328  // input images
329  if (n_ins)
330  sb_cat(opencl_load, BODY, "// get input pixels\n");
331  for (i = 0; i<n_ins; i++)
332  {
333  string si = strdup(i2a(i));
334  sb_cat(helper, nargs? ",": "", "\n const " FREIA_IMAGE "i", si);
335  sb_cat(opencl, nargs? ",": "",
336  "\n " OPENCL_IMAGE "i", si, ", // const?\n int ofs_i", si);
337  cl_args+=2;
338  // image j<in> = i<in> + ofs_i<out> + shift;
339  sb_cat(opencl_pointers, tiling? INDj: "",
340  " " OPENCL_IMAGE "j", si, " = ",
341  "i", si, " + ofs_i", si, " + shift;\n");
342  // , i<in>
343  sb_cat(helper_body, ", i", si);
344  nargs++;
345  free(si);
346  }
347 
348  // pixel in<in> = j<in>[i];
349  //sb_cat(opencl_load,
350  // " " OPENCL_PIXEL "in", si, " = j", si, "[i];\n");
351 
352  // size parameters to handle an image row
353  sb_cat(opencl, ",\n"
354  " int width, // of the working area, vs image pitch below\n"
355  " int height, // of the working area\n"
356  // the pitch is shared by all images
357  // which thus must be declared of the same size...
358  " int pitch");
359  cl_args+=3;
360 
361  // there are possibly other kernel arguments yet to come...
362 
363  // helper call image arguments
364  list limg = NIL;
365  FOREACH(dagvtx, voa, dag_outputs(d))
366  limg = CONS(entity, vtxcontent_out(dagvtx_content(voa)), limg);
367  FOREACH(dagvtx, via, dag_inputs(d))
368  limg = CONS(entity, vtxcontent_out(dagvtx_content(via)), limg);
369  limg = gen_nreverse(limg);
370 
371  sb_cat(opencl_body, BODY, "// pixel computations\n");
372 
373  // actual computations...
374  list vertices = gen_nreverse(gen_copy_seq(dag_vertices(d)));
375  FOREACH(dagvtx, v, vertices)
376  {
377  // skip input nodes
378  if (dagvtx_number(v)==0) continue;
379 
380  // vertex v number as a string
381  string svn = strdup(i2a((int) dagvtx_number(v)));
382 
383  // get details...
384  vtxcontent vc = dagvtx_content(v);
385  pips_assert("there is a statement",
389  int opid = dagvtx_opid(v);
390  const freia_api_t * api = get_freia_api(opid);
391  pips_assert("freia api found", api!=NULL);
392 
393  bool is_a_reduction = api->arg_misc_out;
394  bool is_a_kernel = api->opencl.mergeable_kernel;
395  bool is_a_convolution =
396  is_a_kernel && same_string_p(api->compact_name, "conv");
397 
398  // update for helper call arguments...
399  // kernel operations are specialized, so there is no need to pass it.
400  if (!is_a_kernel)
401  {
403  freia_extract_params(opid, call_arguments(c), helper,
404  is_a_reduction? NULL: opencl, NULL, &nargs));
405  }
406  // input image arguments
407  list preds = dag_vertex_preds(d, v);
408  int nao = 0;
409 
410  // scalar output arguments: we are dealing with a reduction!
411  if (is_a_reduction)
412  {
413  vtxcontent c = dagvtx_content(v);
414  list li = vtxcontent_inputs(c);
415  pips_assert("one input image", gen_length(li)==1);
416  pips_assert("no scalar inputs", !api->arg_misc_in);
417 
418  // get the reduced image
419  entity img = ENTITY(CAR(li));
420  // we deal with only one image at the time...
421  // it is a runtime limitation
422  pips_assert("same image if any", !reduced ^ (img==reduced));
423  if (!reduced)
424  {
425  reduced = img;
426  sb_cat(helper_decls,
427  "\n"
428  " // currently only one reduction structure...\n"
429  " freia_opencl_measure_status redres;\n");
430  // must be the last argument!
431  sb_cat(helper_body_2, ", &redres");
432  sb_cat(helper_tail, "\n // return reduction results\n");
433  sb_cat(opencl_2, ",\n GLOBAL TMeasure * redX");
434  // declare them all
435  sb_cat(opencl_head,
436  " // reduction stuff is currently hardcoded...\n",
437  " int vol = 0;\n",
438  " PIXEL minv = PIXEL_MAX;\n"
439  " int2 minpos = { 0, 0 };\n"
440  " PIXEL maxv = PIXEL_MIN;\n"
441  " int2 maxpos = { 0, 0 };\n"
442  "\n");
443  sb_cat(opencl_end,
444  "\n"
445  " // reduction copy out\n"
446  // assume a 2D worksize. any linearization would do.
447  " int thrid = "
448  "get_global_id(0)*get_global_size(1)+get_global_id(1);\n");
449  n_misc = 1;
450  }
451  // inner loop reduction code
452  dagvtx pred = DAGVTX(CAR(preds));
453  sb_cat(opencl_body, BODY, api->opencl.macro, "(red", svn, ", ",
454  pixel_name(pred, 0, loaded, opencl_load, dag_inputs(d), BODY),
455  ");\n");
456 
457 #define RED " = redres." // for a small code compaction
458 
459  // tail code to copy back stuff in OpenCL.
460  // the runtime will have to finish the reduction
461  // ??? HARDCODED for now...
462  if (same_string_p(api->compact_name, "max"))
463  {
464  sb_cat(opencl_end, " redX[thrid].max = maxv;\n");
465  sb_cat(helper_tail, " *po", i2a(nargs-1), RED "maximum;\n");
466  }
467  else if (same_string_p(api->compact_name, "min"))
468  {
469  sb_cat(opencl_end, " redX[thrid].min = minv;\n");
470  sb_cat(helper_tail, " *po", i2a(nargs-1), RED "minimum;\n");
471  }
472  else if (same_string_p(api->compact_name, "vol"))
473  {
474  sb_cat(opencl_end, " redX[thrid].vol = vol;\n");
475  sb_cat(helper_tail, " *po", i2a(nargs-1), RED "volume;\n");
476  }
477  else if (same_string_p(api->compact_name, "max!"))
478  {
479  sb_cat(opencl_end,
480  " redX[thrid].max = maxv;\n",
481  " redX[thrid].max_x = (uint) maxpos.x;\n",
482  " redX[thrid].max_y = (uint) maxpos.y;\n");
483  sb_cat(helper_tail, " *po", i2a(nargs-3), RED "maximum;\n");
484  sb_cat(helper_tail, " *po", i2a(nargs-2), RED "max_coord_x;\n");
485  sb_cat(helper_tail, " *po", i2a(nargs-1), RED "max_coord_y;\n");
486  }
487  else if (same_string_p(api->compact_name, "min!"))
488  {
489  sb_cat(opencl_end,
490  " redX[thrid].min = minv;\n",
491  " redX[thrid].min_x = (uint) minpos.x;\n",
492  " redX[thrid].min_y = (uint) minpos.y;\n");
493  sb_cat(helper_tail, " *po", i2a(nargs-3), RED "minimum;\n");
494  sb_cat(helper_tail, " *po", i2a(nargs-2), RED "min_coord_x;\n");
495  sb_cat(helper_tail, " *po", i2a(nargs-1), RED "min_coord_y;\n");
496  }
497  }
498  else if (is_a_kernel)
499  {
500  // NOTE about ILP: many intra sequence dependencies are generated...
501  // - the loaded are issued ahead, then the operations are performed.
502  // - some more operation intermixing could be possible in some cases.
503  // - moving gards out does not change anything much wrt "?:" ops
504  // e.g. "if (is_X|is_Y) { op1 op2 }
505 
506  // record for adding specific stuff later...
507  has_kernel = true;
508  pips_assert("one input", gen_length(preds)==1);
509 
510  dagvtx input = DAGVTX(CAR(preds));
511  intptr_t k00, k01, k02, k10, k11, k12, k20, k21, k22;
512  bool extracted = freia_extract_kernel_vtx(v, true,
513  &k00, &k01, &k02, &k10, &k11, &k12, &k20, &k21, &k22);
514 
515  // simplistic hypothesis...
516  pips_assert("got kernel", extracted);
517  pips_assert("trivial kernel",
518  (k00==0 || k00==1) && (k01==0 || k01==1) && (k02==0 || k02==1) &&
519  (k10==0 || k10==1) && (k11==0 || k11==1) && (k12==0 || k12==1) &&
520  (k20==0 || k20==1) && (k21==0 || k21==1) && (k22==0 || k22==1));
521 
522  need_N = need_N || k00 || k01 || k02;
523  need_W = need_W || k00 || k10 || k20;
524  need_E = need_E || k02 || k12 || k22;
525  need_S = need_S || k20 || k21 || k22;
526 
527  // pixel t<vertex> = <init>;
528  sb_cat(opencl_load,
529  BODY, OPENCL_PIXEL "t", svn, " = ", api->opencl.init, ";\n");
530  // t<vertex> = <op>(t<vertex>, boundary?init:j[i+<shift....>]);
531  if (k00)
532  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
533  "(t", svn, ", (", border_condition[4-4], ")? ",
534  api->opencl.init, ": ",
535  pixel_name(input, -4, loaded, opencl_load, dag_inputs(d), BODY),
536  ");\n");
537  if (k01)
538  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
539  "(t", svn, ", (", border_condition[4-3], ")? ",
540  api->opencl.init, ": ",
541  pixel_name(input, -3, loaded, opencl_load, dag_inputs(d), BODY),
542  ");\n");
543  if (k02)
544  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
545  "(t", svn, ", (", border_condition[4-2], ")? ",
546  api->opencl.init, ": ",
547  pixel_name(input, -2, loaded, opencl_load, dag_inputs(d), BODY),
548  ");\n");
549  if (k10)
550  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
551  "(t", svn, ", (", border_condition[4-1], ")? ",
552  api->opencl.init, ": ",
553  pixel_name(input, -1, loaded, opencl_load, dag_inputs(d), BODY),
554  ");\n");
555  // most likely to be non null
556  if (k11)
557  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
558  "(t", svn, ", ",
559  pixel_name(input, 0, loaded, opencl_load, dag_inputs(d), BODY),
560  ");\n");
561  if (k12)
562  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
563  "(t", svn, ", (", border_condition[4+1], ")? ",
564  api->opencl.init, ": ",
565  pixel_name(input, +1, loaded, opencl_load, dag_inputs(d), BODY),
566  ");\n");
567  if (k20)
568  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
569  "(t", svn, ", (", border_condition[4+2], ")? ",
570  api->opencl.init, ": ",
571  pixel_name(input, +2, loaded, opencl_load, dag_inputs(d), BODY),
572  ");\n");
573  if (k21)
574  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
575  "(t", svn, ", (", border_condition[4+3], ")? ",
576  api->opencl.init, ": ",
577  pixel_name(input, +3, loaded, opencl_load, dag_inputs(d), BODY),
578  ");\n");
579  if (k22)
580  sb_cat(opencl_body, BODY, "t", svn, " = ", api->opencl.macro,
581  "(t", svn, ", (", border_condition[4+4], ")? ",
582  api->opencl.init, ": ",
583  pixel_name(input, +4, loaded, opencl_load, dag_inputs(d), BODY),
584  ");\n");
585 
586  if (is_a_convolution)
587  {
588  // compute norm depending on border...
589  sb_cat(opencl_body,
590  BODY, "// compute norm\n",
591  BODY, OPENCL_PIXEL "n", svn, ";\n");
592  // corner first
593  sb_cat(opencl_body,
594  BODY, "if (", border_condition[1], ")\n",
595  BODY, " if (", border_condition[3], ") n", svn,
596  " = ", i2a(k11+k12+k21+k22), ";\n"); // NW
597  sb_cat(opencl_body,
598  BODY, " else if (", border_condition[5], ") n", svn,
599  " = ", i2a(k10+k11+k20+k21), ";\n"); // NE
600  sb_cat(opencl_body,
601  BODY, " else n", svn,
602  " = ", i2a(k10+k11+k12+k20+k21+k22), ";\n"); // N
603  sb_cat(opencl_body,
604  BODY, "else if (", border_condition[7], ")\n",
605  BODY, " if (", border_condition[3], ") n", svn,
606  " = ", i2a(k11+k12+k01+k02), ";\n"); // SW
607  sb_cat(opencl_body,
608  BODY, " else if (", border_condition[5], ") n", svn,
609  " = ", i2a(k10+k11+k00+k01), ";\n"); // SE
610  sb_cat(opencl_body,
611  BODY, " else n", svn,
612  " = ", i2a(k00+k01+k02+k10+k11+k12), ";\n"); // S
613  sb_cat(opencl_body,
614  BODY, "else if (", border_condition[3], ") n", svn,
615  " = ", i2a(k01+k11+k12+k02+k21+k22), ";\n"); // W
616  sb_cat(opencl_body,
617  BODY, "else if (", border_condition[5], ") n", svn,
618  " = ", i2a(k00+k01+k10+k11+k20+k21), ";\n"); // E
619  sb_cat(opencl_body, BODY, "else n", svn,
620  " = ", i2a(k00+k01+k02+k10+k11+k12+k20+k21+k22), ";\n"); // C
621  sb_cat(opencl_body, BODY, "t", svn, " = "
622  "PIXEL_DIV(t", svn, ", n", svn, ");\n");
623  }
624 
625  }
626  else // we are compiling an arithmetic pixel operation
627  {
628  // pixel t<vertex> = <op>(args...);
629  sb_cat(opencl_body,
630  BODY, OPENCL_PIXEL "t", svn, " = ", api->opencl.macro, "(");
631 
632  // macro arguments
633  FOREACH(dagvtx, p, preds)
634  sb_cat(opencl_body, nao++? ", ": "",
635  pixel_name(p, 0, loaded, opencl_load, dag_inputs(d), BODY));
636 
637  gen_free_list(preds), preds = NIL;
638 
639  // other (scalar) input arguments
640  for (int i=0; i<(int) api->arg_misc_in; i++)
641  {
642  string sn = strdup(i2a(n_params));
643  sb_cat(helper, ",\n ", api->arg_in_types[i], " c", sn);
644  sb_cat(opencl, ",\n ", opencl_type(api->arg_in_types[i]), " c", sn);
645  sb_cat(helper_body, ", c", sn);
646  cl_args++;
647  sb_cat(opencl_body, nao++? ", ": "", "c", sn);
648  free(sn);
649  n_params++;
650  }
651 
652  // end of macro call
653  sb_cat(opencl_body, ");\n");
654  }
655 
656  // cleanup
657  free(svn), svn = NULL;
658  } // end of FOREACH on operations
659  gen_free_list(vertices), vertices = NIL;
660 
661  // tail
662  sb_cat(helper, ")\n{\n");
663  string_buffer_append_sb(helper, helper_decls);
664 
665  // this is really for debug, and should not be needed.
666  if (get_bool_property("HWAC_OPENCL_SYNCHRONIZE_KERNELS"))
667  sb_cat(helper, "\n"
668  " // synchronize...\n"
669  " freia_common_wait();\n");
670  sb_cat(helper, "\n"
671  " // call kernel ", cut_name, "\n",
672  " err |= freia_op_call_kernel(kernel");
673  // tell about number of coming kernel parameters
674  sb_cat(helper, ", ", i2a(n_outs)); // output images
675  sb_cat(helper, ", ", i2a(n_ins)); // input images
676  sb_cat(helper, ", ", i2a(n_params)); // input integer parameters
677  sb_cat(helper, ", ", i2a(n_misc)); // output integer pointers
678  string_buffer_append_sb(helper, helper_body); // image & param args
679  string_buffer_append_sb(helper, helper_body_2); // reduction args
680  sb_cat(helper, ");\n");
681  string_buffer_append_sb(helper, helper_tail);
682  sb_cat(helper, "\n return err;\n}\n");
683 
684  // OPENCL CODE
685  string_buffer_append_sb(opencl, opencl_2);
686  sb_cat(opencl, ")\n{\n");
687  string_buffer_append_sb(opencl, opencl_head);
688 
689  if (tiling)
690  // j (height) loop bound
691  sb_cat(opencl,
692  " // loop j upper bound\n"
693  " int Htile = (height+get_global_size(", Hdim, ")-1)/"
694  "get_global_size(", Hdim, ");\n"
695  " int Hlast = Htile*(get_global_id(", Hdim, ")+1);\n"
696  " if (Hlast>height) Hlast = height;\n"
697  "\n");
698 
699  sb_cat(opencl,
700  // i (width) loop bound
701  // work per thread so that all pixes are processed
702  " // loop i upper bound\n"
703  " int Wtile = (width+get_global_size(", Wdim, ")-1)/"
704  "get_global_size(", Wdim,");\n"
705  // last index in the row for this thread
706  " int Wlast = Wtile*(get_global_id(", Wdim, ")+1);\n"
707  " if (Wlast>width) Wlast = width;\n"
708  "\n");
709 
710  if (tiling)
711  sb_cat(opencl_j_loop,
712  // thread x-axis tile loop
713  INDj, "int j;\n",
714  INDj, "for (j=Htile*get_global_id(", Hdim, "); j<Hlast; j++)\n",
715  INDj, "{\n");
716 
717  if (has_kernel) {
718  // it is on the inside, if there is an inside...
719  string IND = tiling? INDj: "";
720  sb_cat(opencl_j_loop, IND, " "
721  "// N & S boundaries, one thread on first dimension per row\n");
722  if (tiling)
723  sb_cat(opencl_j_loop, IND, " ",
724  need_N? "int is_N = (j==0);\n": "// N not needed\n",
725  IND, " ",
726  need_S? "int is_S = (j==(height-1));\n": "// S not needed\n",
727  "\n");
728  else {
729  sb_cat(opencl_j_loop, IND, " ");
730  if (need_N)
731  sb_cat(opencl_j_loop, "int is_N = (get_global_id(", Hdim, ")==0);\n");
732  else
733  sb_cat(opencl_j_loop, "// N not needed\n");
734  sb_cat(opencl_j_loop, IND, " ");
735  if (need_S)
736  sb_cat(opencl_j_loop,
737  "int is_S = (get_global_id(", Hdim, ")==(height-1));\n");
738  else
739  sb_cat(opencl_j_loop, "// S not needed\n");
740  sb_cat(opencl_j_loop, "\n");
741  }
742  }
743 
744  sb_cat(opencl_i_loop,
745  // thread's pixel loop
746  INDi, "int i;\n",
747  INDi, "for (i=Wtile*get_global_id(", Wdim, "); i<Wlast; i++)\n",
748  INDi, "{\n");
749  if (has_kernel) {
750  sb_cat(opencl_i_loop,
751  INDi, " // W & E boundaries, assuming i global index\n");
752  sb_cat(opencl_i_loop, INDi, " ",
753  need_W? "int is_W = (i==0);\n": "// W not needed\n");
754  sb_cat(opencl_i_loop, INDi, " ",
755  need_E? "int is_E = (i==(width-1));\n": "// E not needed\n");
756  sb_cat(opencl_i_loop, "\n");
757  }
758 
759  if (first_loop_on_height) {
760  string_buffer_append_sb(opencl, opencl_j_loop);
761  string_buffer_append_sb(opencl, opencl_pointers);
762  sb_cat(opencl, "\n");
763  string_buffer_append_sb(opencl, opencl_i_loop);
764  }
765  else {
766  string_buffer_append_sb(opencl, opencl_i_loop);
767  string_buffer_append_sb(opencl, opencl_j_loop);
768  string_buffer_append_sb(opencl, opencl_pointers);
769  sb_cat(opencl, "\n");
770  }
771 
772  string_buffer_append_sb(opencl, opencl_load);
773  sb_cat(opencl, "\n");
774  string_buffer_append_sb(opencl, opencl_body);
775  sb_cat(opencl, "\n");
776  string_buffer_append_sb(opencl, opencl_tail);
777  sb_cat(opencl, tiling? " ": " ", "}\n"); // close internal loop
778  if (tiling)
779  sb_cat(opencl, " }\n"); // close external loop
780  string_buffer_append_sb(opencl, opencl_end);
781  sb_cat(opencl, "}\n"); // close function
782 
783  // OpenCL compilation
784  sb_cat(compile,
785  "\n"
786  "// hold kernels for ", cut_name, "\n"
787  "static cl_kernel ", cut_name, "_kernel[2];\n"
788  "\n"
789  "// compile kernels for ", cut_name, "\n"
790  "static freia_status ", cut_name, "_compile(void)\n"
791  "{\n"
792  " // OpenCL source for ", cut_name, "\n"
793  " const char * ", cut_name, "_source =\n");
794  sb_cat(compile, " \"" FREIA_OPENCL_CL_INCLUDES "\\n\"\n ");
796  sb_cat(compile,
797  ";\n"
798  " freia_status err = FREIA_OK;\n"
799  " err |= freia_op_compile_kernel(", cut_name, "_source, "
800  "\"", cut_name, "\", \"-DPIXEL8\","
801  " &", cut_name, "_kernel[0]);\n"
802  " err |= freia_op_compile_kernel(", cut_name, "_source, "
803  "\"", cut_name, "\", \"-DPIXEL16\","
804  " &", cut_name, "_kernel[1]);\n"
805  " return err;\n"
806  "}\n");
807 
808  if (ls)
809  {
810  // cleanup compiled statements
811  FOREACH(dagvtx, v, dag_vertices(d))
812  {
814  if (pstatement_statement_p(ps))
816  }
817 
818  // handle function image arguments
820 
821  // - and substitute its call...
822  stnb = freia_substitute_by_helper_call(NULL, global_remainings, remainings,
823  ls, cut_name, lparams, helpers, stnb);
824 
825  hash_put(signatures, local_name_to_top_level_entity(cut_name),
826  (void*) (_int) n_outs);
827  }
828  // else it is not subtituded
829 
830  // actual printing...
831  string_buffer_to_file(compile, helper_file);
832  string_buffer_to_file(helper, helper_file);
833  string_buffer_to_file(opencl, opencl_file);
834 
835  // cleanup
836  string_buffer_free(&helper);
837  string_buffer_free(&helper_decls);
838  string_buffer_free(&helper_body_2);
839  string_buffer_free(&helper_body);
840  string_buffer_free(&helper_tail);
842  string_buffer_free(&opencl);
843  string_buffer_free(&opencl_2);
844  string_buffer_free(&opencl_head);
845  string_buffer_free(&opencl_i_loop);
846  string_buffer_free(&opencl_j_loop);
847  string_buffer_free(&opencl_pointers);
848  string_buffer_free(&opencl_load);
849  string_buffer_free(&opencl_body);
850  string_buffer_free(&opencl_tail);
851  string_buffer_free(&opencl_end);
852 
853  free(cut_name);
854  set_free(loaded);
855 
856  return stnb;
857 }
858 
859 /* call and generate if necessary a specialized kernel, if possible
860  * the statement is bluntly modified "in place".
861  * @return whether a substition was performed
862  */
864  string module, dagvtx v, hash_table signatures,
865  FILE * helper_file, FILE * opencl_file, set helpers)
866 {
867  pips_debug(4, "considering statement %"_intFMT"\n", dagvtx_number(v));
868  const freia_api_t * api = get_freia_api_vtx(v);
869  intptr_t k00, k01, k02, k10, k11, k12, k20, k21, k22;
870  if (!api ||
871  !api->opencl.mergeable_kernel ||
872  !freia_extract_kernel_vtx(v, true, &k00, &k01, &k02,
873  &k10, &k11, &k12, &k20, &k21, &k22))
874  // do nothing
875  return;
876 
877  // build an id number for this specialized version
878  int number = (k00<<8) + (k01<<7) + (k02<<6) +
879  (k10<<5) + (k11<<4) + (k12<<3) +
880  (k20<<2) + (k21<<1) + k22;
881  pips_assert("non-zero kernel!", number);
882 
883  // main_opencl_helper_<E8,D8,con>
884  string prefix = strdup(cat(module, "_opencl_helper_", api->compact_name));
885  // main_opencl_helper_E8_<0..511>
886  string func_name = strdup(cat(prefix, "_", i2a(number)));
887 
888  entity specialized = local_name_to_top_level_entity(func_name);
889  if (specialized == entity_undefined)
890  {
891  pips_debug(5, "generating %s\n", func_name);
892 
893  // we need to create the function
894  dag one = make_dag(NIL, NIL, NIL);
895  dagvtx vop = copy_dagvtx_norec(v);
896  dag_append_vertex(one, vop);
897  dag_outputs(one) = CONS(dagvtx, vop, NIL);
898  //dag_compute_outputs(one, NULL, output_images, NIL, false);
899  //dag_cleanup_other_statements(nd);
900 
901  // we just generate the function
902  opencl_compile_mergeable_dag(module, one, NIL, prefix, number, NULL, NULL,
903  helper_file, opencl_file, NULL, 0);
904 
905  // then create it... function name must be consistent with
906  // what is done within previous function
907  specialized = freia_create_helper_function(func_name, NIL);
908 
909  // record #outs for this helper, needed for cleaning
910  pips_debug(9, "sig: %s (%p) = %d\n", func_name, specialized, 1);
911  hash_put(signatures, specialized, (void *) (_int) 1);
912  set_add_element(helpers, helpers, specialized);
913 
914  // cleanup
915  free_dag(one);
916  }
917 
918  pips_assert("specialized function found", specialized!=entity_undefined);
919  // directly call the function...
921  list largs = call_arguments(c);
922 
923  int nargs = (int) gen_length(largs);
924  pips_assert("3/5 arguments to function", nargs==3 || nargs==5);
925  // for convolution, should check that we have a 3x3 kernel...
926  call_function(c) = specialized;
927  list third = CDR(CDR(largs));
928  CDR(CDR(largs)) = NIL;
929  gen_full_free_list(third);
930 
931  pips_debug(5, "statement %"_intFMT" specialized\n", dagvtx_number(v));
932 }
933 
934 /* is v a constant kernel operation?
935  */
936 static bool dagvtx_constant_kernel_p(const dagvtx v)
937 {
938  const freia_api_t * api = get_freia_api_vtx(v);
939  intptr_t val;
940  return api->opencl.mergeable_kernel &&
941  freia_extract_kernel_vtx(v, true, &val, &val, &val, &val,
942  &val, &val, &val, &val, &val);
943 }
944 
945 static int compile_this_list(
946  string module,
947  list lvertices,
948  list ls,
949  string split_name,
950  int n_cut,
951  set global_remainings,
952  hash_table signatures,
953  FILE * helper_file, FILE * opencl,
954  set helpers,
955  set output_images,
956  dag fulld,
957  int stnb,
958  int max_stnb)
959 {
960  // actually build subdag if something to merge
961  dag nd = make_dag(NIL, NIL, NIL);
962  FOREACH(dagvtx, v, lvertices)
964  dag_compute_outputs(nd, NULL, output_images, NIL, false);
966 
967  // ??? should not be needed?
969 
970  // ??? hack to ensure dependencies...
971  if (max_stnb>stnb) stnb = max_stnb;
972 
973  // and compile!
975  (module, nd, ls, split_name, n_cut, global_remainings,
976  signatures, helper_file, opencl, helpers, stnb);
977 
978  return stnb;
979 }
980 
981 /* migrate the statements corresponding to the vertices
982  * so that they are one next to the other in the sequence.
983  */
984 static void migrate_statements(list lvertices, sequence sq, set dones)
985 {
986  set stats = set_make(set_pointer);
987  FOREACH(dagvtx, v, lvertices) {
989  if (s) set_add_element(stats, stats, s);
990  }
991  freia_migrate_statements(sq, stats, dones);
992  set_union(dones, dones, stats);
993  set_free(stats);
994 }
995 
996 /* return if the vertex can be merged in the set
997  * - the answer is "no" if there is already a reduction on another input
998  */
999 static bool vertex_mergeable_p(const dagvtx v, const set s, const dag d)
1000 {
1001  if (dagvtx_is_measurement_p(v))
1002  {
1003  list preds = dag_vertex_preds(d, v);
1004  pips_assert("one predecessor to reduction", gen_length(preds)==1);
1005  dagvtx pred = DAGVTX(CAR(preds));
1006  gen_free_list(preds), preds = NIL;
1007  SET_FOREACH(dagvtx, o, s)
1008  if (dagvtx_is_measurement_p(o) && !gen_in_list_p(o, dagvtx_succs(pred)))
1009  return false;
1010  }
1011  return true;
1012 }
1013 
1014 /* extract subdags of merged operations and compile them
1015  * @param d dag to compile, which is destroyed in the process...
1016  */
1018  string module,
1019  sequence sq,
1020  list ls,
1021  dag d,
1022  string fname_fulldag,
1023  int n_split,
1024  const dag fulld,
1025  const set output_images,
1026  FILE * helper_file,
1027  FILE * opencl,
1028  set helpers,
1029  hash_table signatures)
1030 {
1031  string split_name = strdup(cat(fname_fulldag, "_", i2a(n_split)));
1032  pips_debug(3, "compiling for %s\n", split_name);
1033 
1034  dag_dot_dump(module, split_name, d, NIL, NIL);
1035 
1036  int n_cut = 0;
1037 
1038  list // of vertices
1039  lmergeable = NIL,
1040  lnonmergeable = NIL;
1041 
1042  set // of vertices
1043  done = set_make(set_pointer),
1044  mergeable = set_make(set_pointer), // consistent with lmergeable
1045  nonmergeable = set_make(set_pointer); // consistent with lnonmergeable
1046 
1047  set // of statements
1048  dones = set_make(set_pointer);
1049 
1050  // overall remaining statements to compile
1051  set global_remainings = set_make(set_pointer);
1052  set_assign_list(global_remainings, ls);
1053 
1054  // statement number of merged stuff
1055  int stnb = -1;
1056  // statement number of last non mergeable vertex
1057  int max_stnb = -1;
1058 
1059  bool keepon = true;
1060  while (keepon)
1061  {
1062  pips_debug(3, "%s cut %d\n", split_name, n_cut);
1063 
1064  set_clear(done);
1065  set_assign_list(done, dag_inputs(d));
1066 
1067  // build an homogeneous sub dag
1068  list computables, initials;
1069  bool
1070  merge_reductions = get_bool_property("HWAC_OPENCL_MERGE_REDUCTIONS"),
1071  merge_kernels = get_bool_property("HWAC_OPENCL_MERGE_KERNEL_OPERATIONS"),
1072  compile_one_op = get_bool_property("HWAC_OPENCL_COMPILE_ONE_OPERATION"),
1073  generate_specialized_kernel =
1074  get_bool_property("HWAC_OPENCL_GENERATE_SPECIAL_KERNEL_OPS");
1075 
1076  // we eat up all computable vertices, following dependences
1077  // we compute first a maximum set of non mergeable vertices
1078  bool again = true;
1079  while (again &&
1080  (computables = dag_computable_vertices(d, done, done, nonmergeable)))
1081  {
1082  ifdebug(5) {
1083  pips_debug(5, "%d computables\n", (int) gen_length(computables));
1084  gen_fprint(stderr, "computables", computables,
1086  }
1087 
1088  again = false;
1089 
1090  FOREACH(dagvtx, v, computables)
1091  {
1092  if (!opencl_mergeable_p(v) ||
1093  (dagvtx_is_measurement_p(v) && !merge_reductions))
1094  {
1095  lnonmergeable = CONS(dagvtx, v, lnonmergeable);
1096  set_add_element(done, done, v);
1097  set_add_element(nonmergeable, nonmergeable, v);
1098  again = true;
1099  }
1100  }
1101 
1102  if (again)
1103  gen_free_list(computables), computables = NIL;
1104  // else loop exit
1105  }
1106 
1107  // save up for next phase with mergeables
1108  initials = computables;
1109 
1110  // then we keep on with extracting mergeable vertices
1111  again = true;
1112  while (again &&
1113  (computables = dag_computable_vertices(d, done, done, mergeable)))
1114  {
1115  ifdebug(5) {
1116  pips_debug(5, "%d computables\n", (int) gen_length(computables));
1117  gen_fprint(stderr, "computables", computables,
1119  }
1120 
1122  again = false;
1123 
1124  FOREACH(dagvtx, v, computables)
1125  {
1126  // look for reductions
1127  // ??? current runtime limitation, there is only ONE image
1128  // with associated reductions, so the other one are kept out
1129  if (dagvtx_is_measurement_p(v))
1130  {
1131  // a measure is mergeable
1132  if (merge_reductions)
1133  {
1134  lmergeable = CONS(dagvtx, v, lmergeable);
1135  set_add_element(done, done, v);
1136  set_add_element(mergeable, mergeable, v);
1137  again = true;
1138  }
1139  else // reduction are NOT merged by choice
1140  {
1141  // try to put it with the preceeding non mergeable...
1142  if (gen_in_list_p(v, initials))
1143  {
1144  lnonmergeable = CONS(dagvtx, v, lnonmergeable);
1145  set_add_element(done, done, v);
1146  set_add_element(nonmergeable, nonmergeable, v);
1147  again = true;
1148  }
1149  }
1150  }
1151  else if (opencl_mergeable_p(v))
1152  {
1153  // not a reduction, mergeable
1154  lmergeable = CONS(dagvtx, v, lmergeable);
1155  set_add_element(done, done, v);
1156  set_add_element(mergeable, mergeable, v);
1157  again = true;
1158  }
1159  }
1160  gen_free_list(computables), computables = NIL;
1161  }
1162 
1163  // restore vertices order
1164  lmergeable = gen_nreverse(lmergeable);
1165 
1166  // we try to aggregate some kernel ops to this merged task...
1167  if (merge_kernels && lmergeable)
1168  {
1169  pips_debug(3, "looking for kernel ops in predecessors...\n");
1170  list added = NIL;
1171  FOREACH(dagvtx, v, lmergeable)
1172  {
1173  list preds = dag_vertex_preds(d, v);
1174  pips_debug(4, "%d predecessors to vertex %d\n",
1175  (int) gen_length(preds), (int) dagvtx_number(v));
1176  FOREACH(dagvtx, p, preds)
1177  {
1178  if (set_belong_p(nonmergeable, p))
1179  {
1180  // it belongs to the previous set
1181  // we may consider merging it...
1182  const freia_api_t * api = get_freia_api_vtx(p);
1183  pips_debug(5, "predecessor is vertex %d (%s)\n",
1184  (int) dagvtx_number(p), api->compact_name);
1185  intptr_t val;
1186  if (// this is a mergeable kernel
1187  api->opencl.mergeable_kernel &&
1188  // *all* its successors are arithmetic merged
1189  list_in_set_p(dagvtx_succs(p), mergeable) &&
1190  // and the kernel must be fully known
1191  freia_extract_kernel_vtx(p, true, &val, &val, &val, &val,
1192  &val, &val, &val, &val, &val))
1193  {
1194  // change status of p
1195  set_del_element(nonmergeable, nonmergeable, p);
1196  gen_remove(&lnonmergeable, p);
1197  added = CONS(dagvtx, p, added);
1198  }
1199  }
1200  }
1201  // FIX mergeable DELAYED...
1202  if (added)
1203  {
1204  FOREACH(dagvtx, v, added)
1205  set_add_element(mergeable, mergeable, v);
1206  // the added are put ahead in the initial order
1207  lmergeable = gen_nconc(gen_nreverse(added), lmergeable), added = NIL;
1208  }
1209  gen_free_list(preds);
1210  }
1211  }
1212 
1213  // whether to try again later
1214  keepon = lnonmergeable || lmergeable;
1215 
1216  pips_debug(4, "got %d non-mergeables and %d mergeable vertices\n",
1217  (int) gen_length(lnonmergeable), (int) gen_length(lmergeable));
1218 
1219  set merged = set_make(set_pointer);
1220 
1221  if (lnonmergeable)
1222  {
1223  // merge kernel operations with a common input from nonmergeable
1224  // hmmm... "nonmergeable" is really meant with arithmetic operations
1225  if (merge_kernels)
1226  {
1227  // if starting from sinks, the complexity is not good because we have
1228  // to rebuild predecessors over and over which requires scanning all
1229  // vertices...
1230  // it is a little better when starting the other way around...
1231  FOREACH(dagvtx, v, dag_vertices(d))
1232  {
1233  // detect constant-kernels in nonmergeable with a common input
1234  list okays = NIL;
1235  bool some_real_stuff = false;
1236  FOREACH(dagvtx, s, dagvtx_succs(v))
1237  {
1238  if (set_belong_p(nonmergeable, s) &&
1240  okays = CONS(dagvtx, s, okays), some_real_stuff = true;
1241  else if (set_belong_p(mergeable, s) &&
1243  gen_in_list_p(s, initials))
1244  // try to backtrack reductions as well?
1245  okays = CONS(dagvtx, s, okays);
1246  }
1247 
1248  if (some_real_stuff && gen_length(okays)>1) // yep, something to do!
1249  {
1250  pips_debug(5,
1251  "merging %d common input const kernels & reductions\n",
1252  (int) gen_length(okays));
1253 
1254  // fix statement connexity
1255  migrate_statements(okays, sq, dones);
1256 
1257  // let us merge and compile these operations
1258  stnb = compile_this_list(module, okays, ls, split_name, n_cut,
1259  global_remainings, signatures,
1260  helper_file, opencl, helpers,
1261  output_images, fulld, stnb, max_stnb);
1262  // this was another cut!
1263  n_cut++;
1264 
1265  // these are compiled, bye bye!
1266  FOREACH(dagvtx, s, okays)
1267  {
1268  set_add_element(merged, merged, s);
1269  if (set_belong_p(mergeable, s))
1270  {
1271  // dependency hack? is it needed?
1272  // int n = (int) dagvtx_number(s);
1273  // if (n>max_stnb) max_stnb = n;
1274  // cleanup mergeable
1275  set_del_element(mergeable, mergeable, s);
1276  gen_remove(&lmergeable, s);
1277  // also cleanup full dag
1278  dag_remove_vertex(d, s);
1279  }
1280  // non mergeable vertices will be removed later
1281  }
1282  }
1283  gen_free_list(okays);
1284  }
1285  }
1286 
1287  // BUG ??? this is too late for the just-aboved merged kernels
1288  FOREACH(dagvtx, v, lnonmergeable)
1289  {
1290  // keep track of previous which may have dependencies... hmmm...
1291  int n = (int) dagvtx_number(v);
1292  if (n>max_stnb) max_stnb = n;
1293  }
1294 
1295  // fix statement connexity...
1296  migrate_statements(lnonmergeable, sq, dones);
1297 
1298  // possibly compile specialized kernels
1299  if (generate_specialized_kernel)
1300  {
1301  FOREACH(dagvtx, v, lnonmergeable)
1302  if (!set_belong_p(merged, v))
1304  helper_file, opencl, helpers);
1305  }
1306 
1307  // cleanup initial dag??
1308  FOREACH(dagvtx, v, lnonmergeable)
1309  dag_remove_vertex(d, v);
1311 
1312  // cleanup list of remaining computables for nonmergeable
1313  gen_free_list(lnonmergeable), lnonmergeable = NIL;
1314  set_clear(nonmergeable);
1315  set_free(merged);
1316 
1317  n_cut++; // this was a cut, next cut...
1318  }
1319 
1320  // then mergeables
1321  while (lmergeable)
1322  {
1323  list lconnected =
1325 
1326  // fix statement connexity
1327  migrate_statements(lconnected, sq, dones);
1328 
1329  // possibly compile
1330  if (gen_length(lconnected)>1 || compile_one_op)
1331  {
1332  stnb = compile_this_list(module, lconnected, ls, split_name, n_cut,
1333  global_remainings, signatures,
1334  helper_file, opencl, helpers,
1335  output_images, fulld, stnb, max_stnb);
1336  }
1337 
1338  // cleanup initial dag??
1339  FOREACH(dagvtx, v, lconnected)
1340  dag_remove_vertex(d, v);
1342 
1343  // cleanup loop state
1344  gen_free_list(lconnected), lconnected = NIL;
1345  n_cut++; // next cut
1346  }
1347 
1348  gen_free_list(initials), initials = NIL;
1349  set_clear(mergeable);
1350  }
1351 
1352  // cleanup
1353  set_free(global_remainings);
1354  set_free(mergeable);
1355  set_free(nonmergeable);
1356  set_free(done);
1357  set_free(dones);
1358  free(split_name);
1359 }
1360 
1361 /*
1362  @brief compile one dag for OPENCL
1363  @param sq containing sequence
1364  @param ls statements underlying the full dag
1365  @param occs image occurences
1366  @param exchanges statements to exchange because of dependences
1367  @param output_images as a surrogate to use-def chains
1368  @param helper_file output C file for generated code
1369  @return the list of allocated intermediate images
1370 */
1372 (string module,
1373  dag fulld,
1374  sequence sq,
1375  list ls,
1376  const hash_table occs,
1377  hash_table exchanges,
1378  const set output_images,
1379  FILE * helper_file,
1380  set helpers,
1381  int number,
1382  hash_table signatures)
1383 {
1384  pips_debug(3, "considering %d statements\n", (int) gen_length(ls));
1385  pips_assert("some statements", ls);
1386 
1387  int n_op_init, n_op_init_copies;
1388  freia_aipo_count(fulld, &n_op_init, &n_op_init_copies);
1389 
1390  // must have distinct images in the graph for optimizations
1392  list new_images = dag_fix_image_reuse(fulld, init, occs);
1393 
1394  list added_before = NIL, added_after = NIL;
1395  freia_dag_optimize(fulld, exchanges, &added_before, &added_after);
1396 
1397  int n_op_opt, n_op_opt_copies;
1398  freia_aipo_count(fulld, &n_op_opt, &n_op_opt_copies);
1399 
1400  fprintf(helper_file,
1401  "\n"
1402  "// dag %d: %d ops and %d copies, "
1403  "optimized to %d ops and %d+%d+%d copies\n",
1404  number, n_op_init, n_op_init_copies,
1405  n_op_opt, n_op_opt_copies,
1406  (int) gen_length(added_before), (int) gen_length(added_after));
1407 
1408  // opencl file
1409  string opencl_file = get_opencl_file_name(module);
1410  FILE * opencl;
1411  if (file_readable_p(opencl_file))
1412  opencl = safe_fopen(opencl_file, "a");
1413  else
1414  {
1415  opencl = safe_fopen(opencl_file, "w");
1416  fprintf(opencl,
1418  "// generated OpenCL kernels for function %s\n", module);
1419  }
1420  fprintf(opencl, "\n" "// opencl for dag %d\n", number);
1421 
1422  // dump final optimised dag
1423  dag_dot_dump_prefix(module, "dag_cleaned_", number, fulld,
1424  added_before, added_after);
1425 
1426  string fname_fulldag = strdup(cat(module, "_opencl", HELPER, i2a(number)));
1427 
1428  list ld =
1431  NULL, output_images);
1432 
1433  pips_debug(3, "dag initial split in %d dags\n", (int) gen_length(ld));
1434 
1435  int n_split = 0;
1436 
1438  {
1439  set stats = set_make(set_pointer), dones = set_make(set_pointer);
1440  FOREACH(dag, d, ld)
1441  {
1442  if (dag_no_image_operation(d))
1443  continue;
1444 
1445  // fix statements connexity
1446  dag_statements(stats, d);
1447  freia_migrate_statements(sq, stats, dones);
1448  set_union(dones, dones, stats);
1449 
1450  opencl_merge_and_compile(module, sq, ls, d, fname_fulldag, n_split,
1451  fulld, output_images, helper_file, opencl,
1452  helpers, signatures);
1453 
1454  n_split++;
1455  }
1456  set_free(stats);
1457  set_free(dones);
1458  }
1459  // else, do nothing, this is basically like AIPO
1460 
1461  // now may put actual allocations, which messes up statement numbers
1462  list reals =
1463  freia_allocate_new_images_if_needed(ls, new_images, occs, init, signatures);
1464 
1465  // hmmm... is this too late?
1466  freia_insert_added_stats(ls, added_before, true);
1467  added_before = NIL;
1468  freia_insert_added_stats(ls, added_after, false);
1469  added_after = NIL;
1470 
1471  // cleanup
1472  gen_free_list(new_images);
1474  safe_fclose(opencl, opencl_file);
1475  free(opencl_file);
1476 
1477  return reals;
1478 }
void free_dag(dag p)
dag make_dag(list a1, list a2, list a3)
bool is_a_kernel(const char *)
Definition: ikernels.c:369
void const char const char const int
void compile(void)
COMPILE reconnects the Domains table (for not compiled types – note that an inlined type is already c...
Definition: build.c:422
void dag_cleanup_other_statements(dag d)
remove unneeded statements? you must know they are really un-needed!
Definition: dag-utils.c:2191
list dag_vertex_preds(const dag d, const dagvtx target)
return target predecessor vertices as a list.
Definition: dag-utils.c:680
_int dagvtx_number(const dagvtx v)
returns the vertex number, i.e.
Definition: dag-utils.c:98
bool dagvtx_other_stuff_p(const dagvtx v)
a vertex with a non AIPO or image related statement.
Definition: dag-utils.c:76
bool dag_no_image_operation(dag d)
tell whether we have something to do with images ??? hmmm...
Definition: dag-utils.c:2500
list dag_split_on_scalars(const dag initial, bool(*alone_only)(const dagvtx), dagvtx(*choose_vertex)(const list, bool), gen_cmp_func_t priority, void(*priority_update)(const dag), const set output_images)
split a dag on scalar dependencies only, with a greedy heuristics.
Definition: dag-utils.c:2823
list dag_fix_image_reuse(dag d, hash_table init, const hash_table occs)
fix intermediate image reuse in dag
Definition: dag-utils.c:2779
void freia_hack_fix_global_ins_outs(dag dfull, dag d)
catch some cases of missing outs between splits...
Definition: dag-utils.c:2166
bool dagvtx_is_measurement_p(const dagvtx v)
returns whether the vertex is an image measurement operation.
Definition: dag-utils.c:623
dagvtx copy_dagvtx_norec(dagvtx v)
copy a vertex, but without its successors.
Definition: dag-utils.c:611
list dag_computable_vertices(dag d, const set computed, const set maybe, const set currents)
return the vertices which may be computed from the list of available images, excluding vertices in ex...
Definition: dag-utils.c:2307
string dagvtx_number_str(const dagvtx v)
Definition: dag-utils.c:111
void dag_remove_vertex(dag d, const dagvtx v)
remove vertex v from dag d.
Definition: dag-utils.c:570
void dag_dump(FILE *out, const string what, const dag d)
for dag debug
Definition: dag-utils.c:212
void freia_dag_optimize(dag d, hash_table exchanges, list *lbefore, list *lafter)
remove dead image operations.
Definition: dag-utils.c:1416
list dag_connected_component(dag d, list *plv, bool(*compat)(const dagvtx, const set, const dag))
extract a sublist of lv which is a connected component.
Definition: dag-utils.c:2939
void dag_compute_outputs(dag d, const hash_table occs, const set output_images, const list ld, bool inloop)
(re)compute the list of GLOBAL input & output images for this dag ??? BUG the output is rather an app...
Definition: dag-utils.c:2073
void dag_dot_dump(const string module, const string name, const dag d, const list lb, const list la)
generate a "dot" format from a dag to a file.
Definition: dag-utils.c:488
void set_append_vertex_statements(set s, list lv)
Definition: dag-utils.c:2385
statement dagvtx_statement(const dagvtx v)
return statement if any, or NULL (for input nodes).
Definition: dag-utils.c:56
_int dagvtx_opid(const dagvtx v)
Definition: dag-utils.c:121
void dag_statements(set stats, const dag d)
build the set of actual statements in d
Definition: dag-utils.c:64
void dag_append_vertex(dag d, dagvtx nv)
append new vertex nv to dag d.
Definition: dag-utils.c:632
void dag_dot_dump_prefix(const string module, const string prefix, int number, const dag d, const list lb, const list la)
Definition: dag-utils.c:504
FILE * safe_fopen(const char *filename, const char *what)
Definition: file.c:67
char * get_string_property(const char *)
int safe_fclose(FILE *stream, const char *filename)
Definition: file.c:77
bool file_readable_p(char *name)
Definition: file.c:428
bool get_bool_property(const string)
FC 2015-07-20: yuk, moved out to prevent an include cycle dependency include "properties....
entity freia_create_helper_function(const string function_name, list lparams)
Definition: freia-utils.c:1030
void freia_add_image_arguments(list limg, list *lparams)
prepend limg images in front of the argument list limg is consummed by the operation.
Definition: freia-utils.c:1234
list freia_extract_params(const int napi, list args, string_buffer head, string_buffer head2, hash_table params, int *nparams)
returns an allocated expression list of the parameters only (i.e.
Definition: freia-utils.c:613
void hwac_kill_statement(statement s)
remove contents of statement s.
Definition: freia-utils.c:761
list freia_allocate_new_images_if_needed(list ls, list images, const hash_table occs, const hash_table init, const hash_table signatures)
insert image allocation if needed, for intermediate image inserted before if an image is used only tw...
Definition: freia-utils.c:1650
void freia_migrate_statements(sequence sq, const set stats, const set before)
Definition: freia-utils.c:1905
int freia_substitute_by_helper_call(dag d, set global_remainings, set remainings, list ls, const string function_name, list lparams, set helpers, int preceeding)
substitute those statement in ls that are in dag d and accelerated by a call to function_name(lparams...
Definition: freia-utils.c:1073
const freia_api_t * get_freia_api(int index)
Definition: freia-utils.c:477
bool freia_extract_kernel_vtx(dagvtx v, bool strict, intptr_t *k00, intptr_t *k10, intptr_t *k20, intptr_t *k01, intptr_t *k11, intptr_t *k21, intptr_t *k02, intptr_t *k12, intptr_t *k22)
vertex-based version
Definition: freia-utils.c:2012
call freia_statement_to_call(const statement s)
return the actual function call from a statement, dealing with assign and returns....
Definition: freia-utils.c:973
void freia_insert_added_stats(list ls, list stats, bool before)
insert statements to actual code sequence in "ls" BEWARE that ls is assumed to be in reverse order....
Definition: freia-utils.c:1185
int freia_aipo_count(dag d, int *pa, int *pc)
Definition: freia-utils.c:1823
const freia_api_t * get_freia_api_vtx(dagvtx v)
Definition: freia-utils.c:483
#define cat(args...)
Definition: freia.h:41
#define HELPER
Definition: freia.h:38
#define sb_cat(args...)
Definition: freia.h:42
#define FREIA_IMAGE
Definition: freia.h:52
#define dagvtx_freia_api(v)
Definition: freia.h:97
list freia_opencl_compile_calls(string module, dag fulld, sequence sq, list ls, const hash_table occs, hash_table exchanges, const set output_images, FILE *helper_file, set helpers, int number, hash_table signatures)
freia_opencl.c
static void migrate_statements(list lvertices, sequence sq, set dones)
migrate the statements corresponding to the vertices so that they are one next to the other in the se...
Definition: freia_opencl.c:984
static string get_opencl_file_name(string func_name)
Definition: freia_opencl.c:47
static dagvtx choose_opencl_vertex(const list lv, bool started)
choose a vertex, avoiding other stuff if the list is started
Definition: freia_opencl.c:82
static bool vertex_mergeable_p(const dagvtx v, const set s, const dag d)
return if the vertex can be merged in the set
Definition: freia_opencl.c:999
static int dagvtx_opencl_priority(const dagvtx *pv1, const dagvtx *pv2)
qsort helper: return -1 for v1 before v2
Definition: freia_opencl.c:68
static bool opencl_mergeable_p(const dagvtx v)
Definition: freia_opencl.c:57
static string pixel_name(dagvtx v, int shft, set loaded, string_buffer load, list inputs, string indentation)
generate a load if needed for an input variable return the holding variable name in a statically allo...
Definition: freia_opencl.c:113
#define RED
static string border_condition[9]
Definition: freia_opencl.c:104
static string opencl_type(string t)
Definition: freia_opencl.c:97
static int compile_this_list(string module, list lvertices, list ls, string split_name, int n_cut, set global_remainings, hash_table signatures, FILE *helper_file, FILE *opencl, set helpers, set output_images, dag fulld, int stnb, int max_stnb)
Definition: freia_opencl.c:945
static bool dagvtx_constant_kernel_p(const dagvtx v)
is v a constant kernel operation?
Definition: freia_opencl.c:936
static void opencl_merge_and_compile(string module, sequence sq, list ls, dag d, string fname_fulldag, int n_split, const dag fulld, const set output_images, FILE *helper_file, FILE *opencl, set helpers, hash_table signatures)
extract subdags of merged operations and compile them
static void opencl_generate_special_kernel_ops(string module, dagvtx v, hash_table signatures, FILE *helper_file, FILE *opencl_file, set helpers)
call and generate if necessary a specialized kernel, if possible the statement is bluntly modified "i...
Definition: freia_opencl.c:863
#define VARSHIFT(v, n)
static int opencl_compile_mergeable_dag(string module, dag d, list ls, string split_name, int n_cut, set global_remainings, hash_table signatures, FILE *helper_file, FILE *opencl_file, set helpers, int stnb)
perform OpenCL compilation on mergeable dag the generated code relies on some freia-provided runtime ...
Definition: freia_opencl.c:168
#define opencl_merge_prop
Definition: freia_opencl.h:33
#define OPENCL_IMAGE
Definition: freia_opencl.h:37
#define OPENCL_PIXEL
Definition: freia_opencl.h:36
#define FREIA_OPENCL_CL_INCLUDES
Definition: freia_opencl.h:48
#define pstatement_statement_p(x)
#define dagvtx_content(x)
#define dag_outputs(x)
#define vtxcontent_out(x)
#define pstatement_statement(x)
#define dag_inputs(x)
#define dagvtx_succs(x)
#define vtxcontent_inputs(x)
#define dag_vertices(x)
#define vtxcontent_source(x)
#define DAGVTX(x)
DAGVTX.
void gen_full_free_list(list l)
Definition: genClib.c:1023
static int input(void)
void free(void *)
void gen_fprint(FILE *out, string name, const list l, gen_string_func_t item_name)
Definition: list.c:873
list gen_nreverse(list cp)
reverse a list in place
Definition: list.c:304
void gen_remove(list *cpp, const void *o)
remove all occurences of item o from list *cpp, which is thus modified.
Definition: list.c:685
int gen_position(const void *item, const list l)
Element ranks are strictly positive as for first, second, and so on.
Definition: list.c:995
#define NIL
The empty list (nil in Lisp)
Definition: newgen_list.h:47
list gen_copy_seq(list l)
Copy a list structure.
Definition: list.c:501
size_t gen_length(const list l)
Definition: list.c:150
#define CONS(_t_, _i_, _l_)
List element cell constructor (insert an element at the beginning of a list)
Definition: newgen_list.h:150
list gen_nconc(list cp1, list cp2)
physically concatenates CP1 and CP2 but do not duplicates the elements
Definition: list.c:344
#define CAR(pcons)
Get the value of the first element of a list.
Definition: newgen_list.h:92
void gen_free_list(list l)
free the spine of the list
Definition: list.c:327
bool gen_in_list_p(const void *vo, const list lx)
tell whether vo belongs to lx
Definition: list.c:734
#define FOREACH(_fe_CASTER, _fe_item, _fe_list)
Apply/map an instruction block on all the elements of a list.
Definition: newgen_list.h:179
#define CDR(pcons)
Get the list less its first element.
Definition: newgen_list.h:111
void gen_sort_list(list l, gen_cmp_func_t compare)
Sorts a list of gen_chunks in place, to avoid allocations...
Definition: list.c:796
hash_table hash_table_make(hash_key_type key_type, size_t size)
Definition: hash.c:294
void hash_put(hash_table htp, const void *key, const void *val)
This functions stores a couple (key,val) in the hash table pointed to by htp.
Definition: hash.c:364
void hash_table_free(hash_table htp)
this function deletes a hash table that is no longer useful.
Definition: hash.c:327
static entity(* load)(entity)
string db_get_directory_name_for_module(const char *name)
returns the allocated and mkdir'ed directory for module name
Definition: lowlevel.c:150
#define pips_debug
these macros use the GNU extensions that allow variadic macros, including with an empty list.
Definition: misc-local.h:145
#define pips_assert(what, predicate)
common macros, two flavors depending on NDEBUG
Definition: misc-local.h:172
#define pips_internal_error
Definition: misc-local.h:149
char * i2a(int)
I2A (Integer TO Ascii) yields a string for a given Integer.
Definition: string.c:121
@ hash_pointer
Definition: newgen_hash.h:32
#define same_string_p(s1, s2)
set set_assign_list(set, const list)
assigns a list contents to a set all duplicated elements are lost
Definition: set.c:474
set set_del_element(set, const set, const void *)
Definition: set.c:265
bool list_in_set_p(const list, const set)
Definition: set.c:201
#define SET_FOREACH(type_name, the_item, the_set)
enumerate set elements in their internal order.
Definition: newgen_set.h:78
void set_free(set)
Definition: set.c:332
set set_clear(set)
Assign the empty set to s s := {}.
Definition: set.c:326
bool set_belong_p(const set, const void *)
Definition: set.c:194
set set_union(set, const set, const set)
Definition: set.c:211
@ set_pointer
Definition: newgen_set.h:44
set set_make(set_type)
Create an empty set of any type but hash_private.
Definition: set.c:102
set set_add_element(set, const set, const void *)
Definition: set.c:152
void string_buffer_append_c_string_buffer(const string_buffer, string_buffer, int)
put string buffer as a C-string definition of the string buffer, including external double-quotes.
void string_buffer_append_sb(string_buffer, const string_buffer)
append the string buffer sb2 to string buffer sb.
void string_buffer_to_file(const string_buffer, FILE *)
put string buffer into file.
void string_buffer_free(string_buffer *)
free string buffer structure, also free string contents according to the dup field
Definition: string_buffer.c:82
string_buffer string_buffer_make(bool dup)
allocate a new string buffer
Definition: string_buffer.c:58
string(* gen_string_func_t)(const void *)
Definition: newgen_types.h:111
#define _intFMT
Definition: newgen_types.h:57
char * string
STRING.
Definition: newgen_types.h:39
intptr_t _int
_INT
Definition: newgen_types.h:53
int(* gen_cmp_func_t)(const void *, const void *)
Definition: newgen_types.h:114
static char * module
Definition: pips.c:74
list lparams
Array bounds.
Definition: reindexing.c:111
static const char * prefix
entity local_name_to_top_level_entity(const char *n)
This function try to find a top-level entity from a local name.
Definition: entity.c:1450
static int init
Maximal value set for Fortran 77.
Definition: entity.c:320
#define call_function(x)
Definition: ri.h:709
#define ENTITY(x)
ENTITY.
Definition: ri.h:2755
#define entity_undefined
Definition: ri.h:2761
#define call_arguments(x)
Definition: ri.h:711
int fprintf()
test sc_min : ce test s'appelle par : programme fichier1.data fichier2.data ...
char * strdup()
#define ifdebug(n)
Definition: sg.c:47
#define intptr_t
Definition: stdint.in.h:294
internally defined structure.
Definition: string_buffer.c:47
FI: I do not understand why the type is duplicated at the set level.
Definition: set.c:59
The structure used to build lists in NewGen.
Definition: newgen_list.h:41
FREIA API function name -> SPoC hardware description (and others?)
Definition: freia.h:71
unsigned int arg_misc_in
Definition: freia.h:83
string compact_name
Definition: freia.h:75
opencl_hw_t opencl
Definition: freia.h:91
string arg_in_types[3]
Definition: freia.h:86
unsigned int arg_misc_out
Definition: freia.h:82
string init
Definition: freia_opencl.h:60
bool mergeable_kernel
Definition: freia_opencl.h:56
bool mergeable
Definition: freia_opencl.h:54
string macro
Definition: freia_opencl.h:58