PIPS
gpu-ify.c
Go to the documentation of this file.
1 /* A simple phase that outlines parallel loops onto GPU
2 
3  Ronan.Keryell@hpc-project.com
4 */
5 #ifdef HAVE_CONFIG_H
6  #include "pips_config.h"
7 #endif
8 
9 // To have asprintf:
10 #include <stdio.h>
11 
12 #include "genC.h"
13 #include "linear.h"
14 #include "ri.h"
15 #include "effects.h"
16 #include "ri-util.h"
17 #include "effects-util.h"
18 #include "misc.h"
19 #include "effects-generic.h"
20 #include "effects-simple.h"
21 #include "control.h"
22 #include "callgraph.h"
23 #include "pipsdbm.h"
24 #include "accel-util.h"
25 #include "resources.h"
26 #include "properties.h"
27 #include "prettyprint.h" // for print_statement
28 
29 /** Store the loop nests found that meet the spec to be executed on a
30  GPU. Use a list and not a set or hash_map to have always the same
31  order */
33 
34 
35 /* These are the possible prefixes for outline stuff; they are computed from a
36  * property and the current module name
37  */
38 static const char * kernel_prefix = 0;
39 static const char * wrapper_prefix = 0;
40 static const char * launcher_prefix = 0;
41 static const char * fwrapper_prefix = 0;
42 
43 
/**
 * Skip a leading prefix in a name.
 *
 * The prefix is compared case-insensitively. Whether or not the prefix
 * matched, one '_' separator found at the resulting position is skipped
 * too.
 *
 * @param full_name is the name to clean; it is not modified
 * @param bad_prefix is the prefix to jump over when full_name starts with
 * it; NULL is handled like the empty prefix
 *
 * @return a pointer inside full_name on the first char after the prefix
 * and the separator (nothing is allocated), or NULL if full_name is NULL
 */
static const char *clean_prefix(const char *full_name, const char *bad_prefix) {
  if (full_name == NULL) {
    return NULL;
  }
  if (bad_prefix != NULL) {
    // size_t and not int: it is what strlen() returns and what
    // strncasecmp() expects, so no narrowing conversion occurs
    size_t prefix_length = strlen(bad_prefix);
    if (strncasecmp(full_name, bad_prefix, prefix_length) == 0) {
      full_name += prefix_length;
    }
  }
  // Jump over separator
  if (*full_name == '_') {
    full_name++;
  }
  return full_name;
}
54 
55 /**
56  * Trying to get only the original function name without prefix
57  *
58  */
59 static const char* get_clean_mod_name(const char *mod_name) {
60 
61  kernel_prefix = get_string_property("GPU_KERNEL_PREFIX");
62  launcher_prefix = get_string_property("GPU_LAUNCHER_PREFIX");
63  wrapper_prefix = get_string_property("GPU_WRAPPER_PREFIX");
64  fwrapper_prefix = get_string_property("GPU_FORTRAN_WRAPPER_PREFIX");
65 
66  const char * clean_mod_name = mod_name;
67 
68  clean_mod_name = clean_prefix(clean_mod_name,launcher_prefix);
69  clean_mod_name = clean_prefix(clean_mod_name,fwrapper_prefix);
70  clean_mod_name = clean_prefix(clean_mod_name,wrapper_prefix);
71  clean_mod_name = clean_prefix(clean_mod_name,kernel_prefix);
72  return clean_mod_name;
73 }
74 
75 /**
76  * Build the outline function name
77  */
78 string build_outline_name(const char *base_prefix,
79  const char *mod_name) {
80  bool name_suffix_p = get_bool_property("GPU_OUTLINE_SUFFIX_WITH_OWNER_NAME");
81 
82  char *prefix;
83  if(name_suffix_p) {
84  // strdup because concatenate is used during build_new_top_level_module_name
85  prefix = strdup(concatenate(base_prefix,"_",mod_name,NULL));
86  } else {
87  prefix = strdup(base_prefix);
88  }
89 
90  string outline_name = build_new_top_level_module_name(prefix,true);
91 
92  free(prefix);
93 
94  return outline_name;
95 }
96 
97 
98 
99 #if 0
100 /* Get the intrinsic function to get iteration coordinate
101 
102  @param coordinate is the coordinate number (0, 1...)
103 
104  @return the entity of the intrinsics
105  */
106 static entity get_coordinate_intrinsic(int coordinate) {
107  // Get the iteration coordinate intrinsic, for example P4A_vp_1:
108  string coord_name;
109  asprintf(&coord_name,
110  get_string_property("GPU_COORDINATE_INTRINSICS_FORMAT"),
111  coordinate);
112  entity coord_intrinsic = FindOrMakeDefaultIntrinsic(coord_name, 1);
113  free(coord_name);
114  return coord_intrinsic;
115 }
116 #endif
117 
118 
119 static bool
121  /* An interesting loop must be parallel first...
122 
123  We recurse on statements instead of loops in order to pick
124  information on the statement itself, such as pragmas
125  */
126  if(statement_loop_p(s)) {
127  int parallel_loop_nest_depth = depth_of_parallel_perfect_loop_nest(s);
128  ifdebug(3) {
129  pips_debug(1, "Statement %td with // depth %d\n", statement_number(s),
130  parallel_loop_nest_depth);
131  print_statement(s);
132  }
133  if (parallel_loop_nest_depth > 0) {
134  // Register the loop-nest (note the list is in the reverse order):
136  /* Since we only outline outermost loop-nest, stop digging further in
137  this statement: */
138  pips_debug(1, "Statement %td marked to be outlined\n", statement_number(s));
139  return false;
140  }
141  }
142  // This statement is not a parallel loop, go on digging:
143  return true;
144 }
145 
146 
147 
148 /* Transform a loop nest into a GPU or accelerator-like kernel
149 
150  @param s is the parallel loop-nest statement
151 
152  @param depth is the number of loop in the loop nest to be taken out as
153  the GPU iterators
154 
155  Several properties can be used to change the behaviour of this function,
156  as explained in pipsmake-rc
157 
158  For example if depth = 2 and s is:
159  for(i = 1; i <= 499; i += 1)
160  for(j = 1; j <= 499; j += 1)
161  save[i][j] = 0.25*(space[i-1][j]+space[i+1][j]+space[i][j-1]+space[i][j+1]);
162 
163  it generates something like:
164  [...]
165  If the GPU_USE_LAUNCHER property is true, this kind of function is generated:
166  void p4a_kernel_launcher_0(float_t save[501][501], float_t space[501][501])
167  {
168  int i;
169  int j;
170  for(i = 1; i <= 499; i += 1)
171  for(j = 1; j <= 499; j += 1)
172 
173  p4a_kernel_wrapper_0(save, space, i, j);
174  }
175 
176  If the GPU_USE_WRAPPER property is true, this kind of function is generated:
177  void p4a_kernel_wrapper_0(float_t save[501][501], float_t space[501][501], int i, int j)
178  {
179  // To be assigned to a call to P4A_vp_0: i
180  // To be assigned to a call to P4A_vp_1: j
181  p4a_kernel_0(save, space, i, j);
182  }
183 
184  If the GPU_USE_KERNEL property is true, this kind of function is generated:
185  void p4a_kernel_0(float_t save[501][501], float_t space[501][501], int
186  i, int j) {
187  save[i][j] = 0.25*(space[i-1][j]+space[i+1][j]+space[i][j-1]+space[i][j+1]);
188  }
189 
190  Other properties modify the behaviour:
191  GPU_USE_KERNEL_INDEPENDENT_COMPILATION_UNIT,
192  GPU_USE_LAUNCHER_INDEPENDENT_COMPILATION_UNIT,
193  GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT,
194  GPU_COORDINATE_INTRINSICS_FORMAT, GPU_USE_FORTRAN_WRAPPER
195 
196  Look at pipsmake-rc documentation.
197  */
198 static void
199 gpu_ify_statement(statement s, int depth, const char* mod_name) {
200  ifdebug(1) {
201  pips_debug(1, "Parallel loop-nest of depth %d\n", depth);
202  print_statement(s);
203  }
204  // Get the statement inside the loop-nest:
206 
207  // Save the value of a property we are going to change locally:
208  bool old_outline_independent_compilation_unit =
209  get_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT");
210 
211  /* If we want to outline a kernel: */
212  string kernel_name = string_undefined;
213  if (get_bool_property("GPU_USE_KERNEL")) {
214  /* First outline the innermost code (the kernel itself) to avoid
215  spoiling its memory effects if we start with the outermost code
216  first. The kernel name with a prefix defined in the
217  GPU_KERNEL_PREFIX property: */
218  list sk = CONS(STATEMENT, inner, NIL);
219  // Choose if we want the kernel in its own file:
220  set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
221  get_bool_property("GPU_USE_KERNEL_INDEPENDENT_COMPILATION_UNIT"));
222  kernel_name = build_outline_name(kernel_prefix, mod_name);
224  //insert_comments_to_statement(inner, "// Call the compute kernel:");
225  }
226 
227  /* Do we need to insert a wrapper phase to reconstruct iteration
228  coordinates from hardware intrinsics? */
229  if (get_bool_property("GPU_USE_WRAPPER")) {
230  /* Add index initialization from GPU coordinates, in the reverse order
231  since we use insert_comments_to_statement() to avoid furthering the
232  first statement from its original comment: */
233  for(int i = depth - 1; i >= 0; i--) {
235  // Get the iteration coordinate intrinsic, for example P4A_vp_1:
236  /*
237  This code makes a
238 resource SUMMARY_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 149
239 resource CUMULATED_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 152
240 resource PROPER_EFFECTS[p4a_kernel_launcher_1] is in 'required' status since 152
241 resource SUMMARY_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 152
242 resource CUMULATED_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 155
243 resource PROPER_EFFECTS[p4a_kernel_wrapper_1] is in 'required' status since 155
244 user error in rmake: recursion on resource SUMMARY_EFFECTS of p4a_kernel_wrapper_1
245  statement assign = make_assign_statement(entity_to_expression(index),
246  MakeUnaryCall(get_coordinate_intrinsic(i),
247  entity_to_expression(index)));
248  So keep simple right now
249  */
250 
251  /* Add a comment to know what to do later: */
252  string comment;
253  string intrinsic_name;
254  /* Map the inner loop index (numbered i) with the lower GPU
255  coordinate (numbered depth - 1 - i)). In this way, if the code
256  was cache-friendly, it should remain GPU-memory friendly
257 
258  Build the intrinsics of this form: P4A_vp_<depth - 1 - i>
259  */
260  asprintf(&intrinsic_name,
261  get_string_property("GPU_COORDINATE_INTRINSICS_FORMAT"),
262  depth - 1 - i);
263  /* Add a comment in the form of
264 
265  To be assigned to a call to P4A_vp_1: j
266 
267  that may be replaced by a post-processor later by
268 
269  j = P4A_vp_1();
270  or whatever according to the target accelerator
271  */
272  asprintf(&comment, "%s To be assigned to a call to %s: %s\n",
273  c_module_p(get_current_module_entity()) ? "//" : "C",
274  intrinsic_name,
275  entity_user_name(index));
276  free(intrinsic_name);
278  }
279 
280  /* Then outline the innermost code again (the kernel wrapper) that owns
281  the kernel call. The kernel wrapper name with a prefix defined in the
282  GPU_WRAPPER_PREFIX property: */
283  list sk = CONS(STATEMENT, inner, NIL);
284  // Choose if we want the wrapper in its own file:
285  set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
286  get_bool_property("GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT"));
287  string wrapper_name = build_outline_name(wrapper_prefix, mod_name);
288  outliner(wrapper_name, sk);
289 
290  /* Here we check if we had requested to outline a kernel previously, and we
291  * ensure that if the wrapper wasn't generated in a new compilation unit,
292  * then it should be added in the same compilation unit as the kernel.
293  * It won't be declared in the compilation unit, but if the kernel have been
294  * generated in a new compilation unit, there is no PARSED_CODE resource
295  * available and thus we can't use AddEntityToCompilationUnit()
296  */
297  if(kernel_name && !string_undefined_p(kernel_name)
298  && !get_bool_property("GPU_USE_WRAPPER_INDEPENDENT_COMPILATION_UNIT")) {
299  string source_file_name =
300  db_get_memory_resource(DBR_USER_FILE, kernel_name, true);
301  DB_PUT_FILE_RESOURCE(DBR_USER_FILE, wrapper_name, strdup(source_file_name));
302  }
303 
304  //insert_comments_to_statement(inner, "// Call the compute kernel wrapper:");
305  }
306 
307  if (get_bool_property("GPU_USE_LAUNCHER")) {
308  /* Outline the kernel launcher with a prefix defined in the
309  GPU_LAUNCHER_PREFIX property: */
310  if(get_bool_property("GPU_IFY_ANNOTATE_LOOP_NESTS")) {
311  // Annotate loop nest now, so that we know which are parallel !
314  }
315  list sl = CONS(STATEMENT, s, NIL);
316  statement st;
317  // Choose if we want the launcher in its own file:
318  set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
319  get_bool_property("GPU_USE_LAUNCHER_INDEPENDENT_COMPILATION_UNIT"));
320  st = outliner(build_outline_name(launcher_prefix, mod_name), sl);
321  if (get_bool_property("GPU_USE_FORTRAN_WRAPPER")) {
322  string fwp = strdup(concatenate(fwrapper_prefix,"_",mod_name,NULL));
323  ifdebug(3) {
324  pips_debug(1, "Outline Fortan_wrapper with prefix %s\n", fwp);
325  }
327  free(fwp);
328  }
329  //insert_comments_to_statement(inner, "// Call the compute kernel launcher:");
330  }
331  // Restore the original property value:
332  set_bool_property("OUTLINE_INDEPENDENT_COMPILATION_UNIT",
333  old_outline_independent_compilation_unit);
334 }
335 
336 
337 /* Transform all the parallel loop nests of a module into smaller
338  independent functions suitable for GPU-style accelerators.
339 
340  What can be done is more detailed in gpu_ify_statement(). The various
341  functions are generated or not according to different properties.
342 
343  @param mod_name is the name of the module to work on.
344 
345  @return true since it should succeed...
346 */
347 bool gpu_ify(const string mod_name) {
348  // Use this module name and this environment variable to set
350  "GPU_IFY_DEBUG_LEVEL");
351 
352  // Get the effects and use them:
353  set_cumulated_rw_effects((statement_effects)db_get_memory_resource(DBR_CUMULATED_EFFECTS,mod_name,true));
354 
355  // Initialize the loop nest set to outline to the empty set yet:
357 
358  // Mark interesting loops:
361 
362  /* Outline the previous marked loop nests.
363  First put the statements to outline in the good order: */
365 
366  /* Clean module name from prefix */
368 
370  // We could have stored the depth, but it complexifies the code...
372  }
373 
375 
376  // No longer use effects:
378 
379  // We may have outlined some code, so recompute the callees:
380  DB_PUT_MEMORY_RESOURCE(DBR_CALLEES, mod_name,
382 
383  // Put back the new statement module
385  // The macro above does a "return TRUE" indeed.
386 }
387 
388 
statement outliner(const char *, list)
outline the statements in statements_to_outline into a module named outline_module_name the outlined ...
Definition: outlining.c:1327
static statement module_statement
Definition: alias_check.c:125
entity FindOrMakeDefaultIntrinsic(string name, int arity)
Create a default intrinsic.
Definition: bootstrap.c:4397
callees compute_callees(const statement stat)
Recompute the callees of a module statement.
Definition: callgraph.c:355
void set_cumulated_rw_effects(statement_effects)
void reset_cumulated_rw_effects(void)
const char * global_name_to_user_name(const char *global_name)
functions on strings for entity names
Definition: entity_names.c:136
char * get_string_property(const char *)
bool get_bool_property(const string)
FC 2015-07-20: yuk, moved out to prevent an include cycle dependency include "properties....
static void comment(string_buffer code, spoc_hardware_type hw, dagvtx v, int stage, int side, bool flip)
Definition: freia_spoc.c:52
#define gen_recurse(start, domain_number, flt, rwt)
Definition: genC.h:283
void free(void *)
static const char * launcher_prefix
Definition: gpu-ify.c:40
static const char * wrapper_prefix
Definition: gpu-ify.c:39
static const char * clean_prefix(const char *full_name, const char *bad_prefix)
Return a pointer on the first char after the bad_prefix.
Definition: gpu-ify.c:45
static const char * get_clean_mod_name(const char *mod_name)
Trying to get only the original function name without prefix.
Definition: gpu-ify.c:59
static const char * fwrapper_prefix
Definition: gpu-ify.c:41
static const char * kernel_prefix
These are the possibles prefixes for outline stuff, they are computed from a property and the current...
Definition: gpu-ify.c:38
bool gpu_ify(const string mod_name)
Transform all the parallel loop nests of a module into smaller independent functions suitable for GPU...
Definition: gpu-ify.c:347
static list loop_nests_to_outline
A simple phase that outlines parallel loops onto GPU.
Definition: gpu-ify.c:32
static bool mark_loop_to_outline(const statement s)
Definition: gpu-ify.c:120
static void gpu_ify_statement(statement s, int depth, const char *mod_name)
Transform a loop nest into a GPU or accelerator-like kernel.
Definition: gpu-ify.c:199
string build_outline_name(const char *base_prefix, const char *mod_name)
Build the outline function name.
Definition: gpu-ify.c:78
bool gpu_loop_nest_annotate_on_statement(statement)
loop_nest_annotate.c
statement get_current_module_statement(void)
Get the current module statement.
Definition: static.c:208
entity get_current_module_entity(void)
Get the entity of the current module.
Definition: static.c:85
void gen_null(__attribute__((unused)) void *unused)
Ignore the argument.
Definition: genClib.c:2752
entity perfectly_nested_loop_index_at_depth(statement s, int depth)
Get the index of the loop at a given depth inside a loop-nest.
Definition: loop.c:694
statement perfectly_nested_loop_to_body_at_depth(statement s, int depth)
Extract the loop-body of a perfect loop-nest at a given depth.
Definition: loop.c:646
int depth_of_parallel_perfect_loop_nest(statement s)
Compute the depth of a parallel perfect loop-nest.
Definition: loop.c:436
list gen_nreverse(list cp)
reverse a list in place
Definition: list.c:304
#define NIL
The empty list (nil in Lisp)
Definition: newgen_list.h:47
#define CONS(_t_, _i_, _l_)
List element cell constructor (insert an element at the beginning of a list)
Definition: newgen_list.h:150
void gen_free_list(list l)
free the spine of the list
Definition: list.c:327
#define FOREACH(_fe_CASTER, _fe_item, _fe_list)
Apply/map an instruction block on all the elements of a list.
Definition: newgen_list.h:179
#define PIPS_PHASE_POSTLUDE(new_module_statement)
End a transformation phase by putting back into PIPS the (possibly) modified statement.
#define PIPS_PHASE_PRELUDE(module_name, debug_env_var)
Start a phase that use a module CODE.
string db_get_memory_resource(const char *rname, const char *oname, bool pure)
Return the pointer to the resource, whatever it is.
Definition: database.c:755
#define DB_PUT_MEMORY_RESOURCE(res_name, own_name, res_val)
conform to old interface.
Definition: pipsdbm-local.h:66
#define DB_PUT_FILE_RESOURCE
Put a file resource into the current workspace database.
Definition: pipsdbm-local.h:85
bool statement_loop_p(statement)
Definition: statement.c:349
void insert_comments_to_statement(statement, const char *)
Insert a comment string (if non empty) at the beginning of the comments of a statement.
Definition: statement.c:1916
#define full_name(dir, name)
Definition: compile.c:414
#define pips_debug
these macros use the GNU extensions that allow variadic macros, including with an empty list.
Definition: misc-local.h:145
#define asprintf
Definition: misc-local.h:225
string concatenate(const char *,...)
Return the concatenation of the given strings.
Definition: string.c:183
#define string_undefined
Definition: newgen_types.h:40
#define string_undefined_p(s)
Definition: newgen_types.h:41
void print_statement(statement)
Print a statement on stderr.
Definition: statement.c:98
void set_bool_property(const char *, bool)
static const char * prefix
const char * entity_user_name(entity e)
Since entity_local_name may contain PIPS special characters such as prefixes (label,...
Definition: entity.c:487
bool c_module_p(entity m)
Test if a module "m" is written in C.
Definition: entity.c:2777
string build_new_top_level_module_name(const char *prefix, bool prevent_suffix)
Get a new name for a module built from a prefix.
Definition: module.c:55
#define statement_domain
newgen_sizeofexpression_domain_defined
Definition: ri.h:362
#define entity_name(x)
Definition: ri.h:2790
#define statement_number(x)
Definition: ri.h:2452
#define STATEMENT(x)
STATEMENT.
Definition: ri.h:2413
char * strdup()
#define ifdebug(n)
Definition: sg.c:47
The structure used to build lists in NewGen.
Definition: newgen_list.h:41
static int depth
la sequence de nids