/*
 * Cell Broadband Engine OProfile Support
 *
 * (C) Copyright IBM Corporation 2006
 *
 * Author: Maynard Johnson <maynardj@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

/* The purpose of this file is to handle SPU event task switching
 * and to record SPU context information into the OProfile
 * event buffer.
 *
 * Additionally, the spu_sync_buffer function is provided as a helper
 * for recording actual SPU program counter samples to the event buffer.
 */
#include <linux/dcookies.h>
#include <linux/kref.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/numa.h>
#include <linux/oprofile.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "pr_util.h"

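/* Sentinel value passed to release_cached_info() to release the cached
 * info for every SPU rather than a single one.
 */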
#define RELEASE_ALL 9999

static DEFINE_SPINLOCK(buffer_lock);
static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
static int spu_prof_num_nodes;

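/* Per-SPU circular buffers for profile data, the delayed work item that
 * periodically drains them into the OProfile kernel buffer, and the
 * number of entries in each per-SPU buffer.
 */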
struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
struct delayed_work spu_work;
static unsigned max_spu_buff;

static void spu_buff_add(unsigned long int value, int spu)
{
	/* spu_buff is a circular buffer.  Add entries to the
	 * head.  Head is the index to store the next value.
	 * The buffer is treated as full when only one free entry
	 * remains, so head and tail never become equal on a write.
	 * That way we can tell the difference between a full buffer
	 * (head one behind tail) and an empty one (head equal to tail).
	 *
	 *  ASSUMPTION: the buffer_lock is held when this function
	 *             is called to lock the buffer, head and tail.
	 */
	int full = 1;

	if (spu_buff[spu].head >= spu_buff[spu].tail) {
		if ((spu_buff[spu].head - spu_buff[spu].tail)
		    < (max_spu_buff - 1))
			full = 0;

	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
		if ((spu_buff[spu].tail - spu_buff[spu].head)
		    > 1)
			full = 0;
	}

	if (!full) {
		spu_buff[spu].buff[spu_buff[spu].head] = value;
		spu_buff[spu].head++;

		if (spu_buff[spu].head >= max_spu_buff)
			spu_buff[spu].head = 0;
	} else {
		/* From the user's perspective make the SPU buffer
		 * size management/overflow look like we are using
		 * per cpu buffers.  The user uses the same
		 * per cpu parameter to adjust the SPU buffer size.
		 * Increment the sample_lost_overflow to inform
		 * the user the buffer size needs to be increased.
		 */
		oprofile_cpu_buffer_inc_smpl_lost();
	}
}

/* This function copies the per SPU buffers to the
 * OProfile kernel buffer.
 */
static void sync_spu_buff(void)
{
	int spu;
	unsigned long flags;
	int curr_head;

	for (spu = 0; spu < num_spu_nodes; spu++) {
		/* In case there was an issue and the buffer didn't
		 * get created, skip it.
		 */
		if (spu_buff[spu].buff == NULL)
			continue;

		/* Hold the lock to make sure the head/tail
		 * doesn't change while spu_buff_add() is
		 * deciding if the buffer is full or not.
		 * Being a little paranoid.
		 */
		spin_lock_irqsave(&buffer_lock, flags);
		curr_head = spu_buff[spu].head;
		spin_unlock_irqrestore(&buffer_lock, flags);

		/* Transfer the current contents to the kernel buffer.
		 * Data can still be added to the head of the buffer.
		 */
		oprofile_put_buff(spu_buff[spu].buff,
				  spu_buff[spu].tail,
				  curr_head, max_spu_buff);

		spin_lock_irqsave(&buffer_lock, flags);
		spu_buff[spu].tail = curr_head;
		spin_unlock_irqrestore(&buffer_lock, flags);
	}
}

static void wq_sync_spu_buff(struct work_struct *work)
{
	/* move data from spu buffers to kernel buffer */
	sync_spu_buff();

	/* only reschedule if profiling is not done */
	if (spu_prof_running)
		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
}

/* Container for caching information about an active SPU task. */
struct cached_info {
	struct vma_to_fileoffset_map *map;
	struct spu *the_spu;	/* needed to access pointer to local_store */
	struct kref cache_ref;
};

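/* One cached_info pointer per physical SPU, indexed by spu->number. */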
static struct cached_info *spu_info[MAX_NUMNODES * SPUS_PER_NODE];

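/* kref release function: frees the vma-to-file-offset map and the
 * cached_info itself, and drops the module reference taken in
 * prepare_cached_spu_info().
 */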
static void destroy_cached_info(struct kref *kref)
{
	struct cached_info *info;

	info = container_of(kref, struct cached_info, cache_ref);
	vma_map_free(info->map);
	kfree(info);
	module_put(THIS_MODULE);
}

/* Return the cached_info for the passed SPU number.
 * ATTENTION:  Callers are responsible for obtaining the
 *	       cache_lock if needed prior to invoking this function.
 */
static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num)
{
	struct kref *ref;
	struct cached_info *ret_info;

	if (spu_num >= num_spu_nodes) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: Invalid index %d into spu info cache\n",
		       __func__, __LINE__, spu_num);
		ret_info = NULL;
		goto out;
	}
	if (!spu_info[spu_num] && the_spu) {
		ref = spu_get_profile_private_kref(the_spu->ctx);
		if (ref) {
			spu_info[spu_num] = container_of(ref, struct cached_info, cache_ref);
			kref_get(&spu_info[spu_num]->cache_ref);
		}
	}

	ret_info = spu_info[spu_num];
 out:
	return ret_info;
}


/* Looks for cached info for the passed spu.  If not found, the
 * cached info is created for the passed spu.
 * Returns 0 for success; otherwise, a negative error code.
 */
static int
prepare_cached_spu_info(struct spu *spu, unsigned long objectId)
{
	unsigned long flags;
	struct vma_to_fileoffset_map *new_map;
	int retval = 0;
	struct cached_info *info;

	/* We won't bother getting cache_lock here since we
	 * don't do anything with the cached_info that's returned.
	 */
	info = get_cached_info(spu, spu->number);

	if (info) {
		pr_debug("Found cached SPU info.\n");
		goto out;
	}

	/* Create cached_info and set spu_info[spu->number] to point to it.
	 * spu->number is a system-wide value, not a per-node value.
	 */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: failed to allocate cached_info\n",
		       __func__, __LINE__);
		retval = -ENOMEM;
		goto err_alloc;
	}
	new_map = create_vma_map(spu, objectId);
	if (!new_map) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: create vma_map failed\n",
		       __func__, __LINE__);
		retval = -ENOMEM;
		goto err_alloc;
	}

	pr_debug("Created vma_map\n");
	info->map = new_map;
	info->the_spu = spu;
	kref_init(&info->cache_ref);
	spin_lock_irqsave(&cache_lock, flags);
	spu_info[spu->number] = info;
	/* Increment count before passing off ref to SPUFS. */
	kref_get(&info->cache_ref);

	/* We increment the module refcount here since SPUFS is
	 * responsible for the final destruction of the cached_info,
	 * and it must be able to access the destroy_cached_info()
	 * function defined in the OProfile module.  We decrement
	 * the module refcount in destroy_cached_info.
	 */
	try_module_get(THIS_MODULE);
	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
				destroy_cached_info);
	spin_unlock_irqrestore(&cache_lock, flags);
	goto out;

err_alloc:
	kfree(info);
out:
	return retval;
}

/*
 * NOTE:  The caller is responsible for locking the
 *	  cache_lock prior to calling this function.
 */
static int release_cached_info(int spu_index)
{
	int index, end;

	if (spu_index == RELEASE_ALL) {
		end = num_spu_nodes;
		index = 0;
	} else {
		if (spu_index >= num_spu_nodes) {
			printk(KERN_ERR "SPU_PROF: "
				"%s, line %d: "
				"Invalid index %d into spu info cache\n",
				__func__, __LINE__, spu_index);
			goto out;
		}
		end = spu_index + 1;
		index = spu_index;
	}
	for (; index < end; index++) {
		if (spu_info[index]) {
			kref_put(&spu_info[index]->cache_ref,
				 destroy_cached_info);
			spu_info[index] = NULL;
		}
	}

out:
	return 0;
}

/* The source code for fast_get_dcookie was "borrowed"
 * from drivers/oprofile/buffer_sync.c.
 */

/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer).
 */
static inline unsigned long fast_get_dcookie(const struct path *path)
{
	unsigned long cookie;

	if (path->dentry->d_flags & DCACHE_COOKIE)
		return (unsigned long)path->dentry;
	get_dcookie(path, &cookie);
	return cookie;
}

/* Look up the dcookie for the task's mm->exe_file,
 * which corresponds loosely to "application name". Also, determine
 * the offset for the SPU ELF object.  If the computed offset is
 * non-zero, it implies an embedded SPU object; otherwise, it's a
 * separate SPU binary, in which case we retrieve its dcookie.
 * For the embedded case, we must determine if the SPU ELF is embedded
 * in the executable application or in another file (i.e., a shared lib).
 * If embedded in a shared lib, we must get the dcookie for that lib and
 * return it to the caller.
 */
static unsigned long
get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
			    unsigned long *spu_bin_dcookie,
			    unsigned long spu_ref)
{
	unsigned long app_cookie = 0;
	unsigned int my_offset = 0;
	struct vm_area_struct *vma;
	struct file *exe_file;
	struct mm_struct *mm = spu->mm;

	if (!mm)
		goto out;

	exe_file = get_mm_exe_file(mm);
	if (exe_file) {
		app_cookie = fast_get_dcookie(&exe_file->f_path);
		pr_debug("got dcookie for %pD\n", exe_file);
		fput(exe_file);
	}

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
			continue;
		my_offset = spu_ref - vma->vm_start;
		if (!vma->vm_file)
			goto fail_no_image_cookie;

		pr_debug("Found spu ELF at %X(object-id:%lx) for file %pD\n",
			 my_offset, spu_ref, vma->vm_file);
		*offsetp = my_offset;
		break;
	}

	/* If no VMA contained spu_ref, vma is NULL here; bail out rather
	 * than dereferencing it below.
	 */
	if (!vma)
		goto fail_no_image_cookie;

	*spu_bin_dcookie = fast_get_dcookie(&vma->vm_file->f_path);
	pr_debug("got dcookie for %pD\n", vma->vm_file);

	up_read(&mm->mmap_sem);

out:
	return app_cookie;

fail_no_image_cookie:
	up_read(&mm->mmap_sem);

	printk(KERN_ERR "SPU_PROF: "
		"%s, line %d: Cannot find dcookie for SPU binary\n",
		__func__, __LINE__);
	goto out;
}


/* This function finds or creates cached context information for the
 * passed SPU and records SPU context information into the OProfile
 * event buffer.
 */
static int process_context_switch(struct spu *spu, unsigned long objectId)
{
	unsigned long flags;
	int retval;
	unsigned int offset = 0;
	unsigned long spu_cookie = 0, app_dcookie;

	retval = prepare_cached_spu_info(spu, objectId);
	if (retval)
		goto out;

	/* Get dcookie first because a mutex_lock is taken in that
	 * code path, so interrupts must not be disabled.
	 */
	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
	if (!app_dcookie || !spu_cookie) {
		retval = -ENOENT;
		goto out;
	}

	/* Record context info in event buffer */
	spin_lock_irqsave(&buffer_lock, flags);
	spu_buff_add(ESCAPE_CODE, spu->number);
	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
	spu_buff_add(spu->number, spu->number);
	spu_buff_add(spu->pid, spu->number);
	spu_buff_add(spu->tgid, spu->number);
	spu_buff_add(app_dcookie, spu->number);
	spu_buff_add(spu_cookie, spu->number);
	spu_buff_add(offset, spu->number);

	/* Set flag to indicate SPU PC data can now be written out.  If
	 * the SPU program counter data is seen before an SPU context
	 * record is seen, the postprocessing will fail.
	 */
	spu_buff[spu->number].ctx_sw_seen = 1;

	spin_unlock_irqrestore(&buffer_lock, flags);
	smp_wmb();	/* ensure spu event buffer updates are written */
			/* don't want entries intermingled... */
out:
	return retval;
}

/*
 * This function is invoked on either a bind_context or unbind_context.
 * If called for an unbind_context, the val arg is 0; otherwise,
 * it is the object-id value for the spu context.
 * The data arg is of type 'struct spu *'.
 */
static int spu_active_notify(struct notifier_block *self, unsigned long val,
				void *data)
{
	int retval;
	unsigned long flags;
	struct spu *the_spu = data;

	pr_debug("SPU event notification arrived\n");
	if (!val) {
		spin_lock_irqsave(&cache_lock, flags);
		retval = release_cached_info(the_spu->number);
		spin_unlock_irqrestore(&cache_lock, flags);
	} else {
		retval = process_context_switch(the_spu, val);
	}
	return retval;
}

static struct notifier_block spu_active = {
	.notifier_call = spu_active_notify,
};

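/* Determine how many Cell nodes have at least one online CPU; node
 * numbering is assumed to be contiguous starting at 0.
 */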
static int number_of_online_nodes(void)
{
	u32 cpu;
	u32 tmp;
	int nodes = 0;

	for_each_online_cpu(cpu) {
		tmp = cbe_cpu_to_node(cpu) + 1;
		if (tmp > nodes)
			nodes++;
	}
	return nodes;
}

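/* Allocate one circular buffer per SPU.  The number of entries in each
 * buffer mirrors the OProfile per-cpu buffer size, so the user tunes
 * both with the same parameter.
 */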
static int oprofile_spu_buff_create(void)
{
	int spu;

	max_spu_buff = oprofile_get_cpu_buffer_size();

	for (spu = 0; spu < num_spu_nodes; spu++) {
		/* create circular buffers to store the data in.
		 * use locks to manage accessing the buffers
		 */
		spu_buff[spu].head = 0;
		spu_buff[spu].tail = 0;

		/*
		 * Create a buffer for each SPU.  Can't reliably
		 * create a single buffer for all spus due to not
		 * enough contiguous kernel memory.
		 */

		spu_buff[spu].buff = kzalloc((max_spu_buff
					      * sizeof(unsigned long)),
					     GFP_KERNEL);

		if (!spu_buff[spu].buff) {
			printk(KERN_ERR "SPU_PROF: "
			       "%s, line %d: oprofile_spu_buff_create "
			       "failed to allocate spu buffer %d.\n",
			       __func__, __LINE__, spu);

			/* release the spu buffers that have been allocated */
			while (spu >= 0) {
				kfree(spu_buff[spu].buff);
				spu_buff[spu].buff = NULL;
				spu--;
			}
			return -ENOMEM;
		}
	}
	return 0;
}

/* The main purpose of this function is to synchronize
 * OProfile with SPUFS by registering to be notified of
 * SPU task switches.
 *
 * NOTE: When profiling SPUs, we must ensure that only
 * spu_sync_start is invoked and not the generic sync_start
 * in drivers/oprofile/oprof.c.  A return value of
 * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
 * accomplish this.
 */
int spu_sync_start(void)
{
	int spu;
	int ret = SKIP_GENERIC_SYNC;
	int register_ret;
	unsigned long flags = 0;

	spu_prof_num_nodes = number_of_online_nodes();
	num_spu_nodes = spu_prof_num_nodes * SPUS_PER_NODE;
	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);

	/* create buffer for storing the SPU data to put in
	 * the kernel buffer.
	 */
	ret = oprofile_spu_buff_create();
	if (ret)
		goto out;

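	/* Write the SPU_PROFILING header record into each SPU's buffer so
	 * the post-processor knows SPU profiling data follows and how many
	 * SPUs are being profiled.
	 */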
	spin_lock_irqsave(&buffer_lock, flags);
	for (spu = 0; spu < num_spu_nodes; spu++) {
		spu_buff_add(ESCAPE_CODE, spu);
		spu_buff_add(SPU_PROFILING_CODE, spu);
		spu_buff_add(num_spu_nodes, spu);
	}
	spin_unlock_irqrestore(&buffer_lock, flags);

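	/* Reset the per-SPU bookkeeping used while profiling runs. */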
	for (spu = 0; spu < num_spu_nodes; spu++) {
		spu_buff[spu].ctx_sw_seen = 0;
		spu_buff[spu].last_guard_val = 0;
	}

	/* Register for SPU events */
	register_ret = spu_switch_event_register(&spu_active);
	if (register_ret) {
		ret = SYNC_START_ERROR;
		goto out;
	}

	pr_debug("spu_sync_start -- running.\n");
out:
	return ret;
}

/* Record SPU program counter samples to the oprofile event buffer. */
void spu_sync_buffer(int spu_num, unsigned int *samples,
		     int num_samples)
{
	unsigned long long file_offset;
	unsigned long flags;
	int i;
	struct vma_to_fileoffset_map *map;
	struct spu *the_spu;
	unsigned long long spu_num_ll = spu_num;
	unsigned long long spu_num_shifted = spu_num_ll << 32;
	struct cached_info *c_info;

	/* We need to obtain the cache_lock here because it's
	 * possible that after getting the cached_info, the SPU job
	 * corresponding to this cached_info may end, thus resulting
	 * in the destruction of the cached_info.
	 */
	spin_lock_irqsave(&cache_lock, flags);
	c_info = get_cached_info(NULL, spu_num);
	if (!c_info) {
		/* This legitimately happens when the SPU task ends before all
		 * samples are recorded.
		 * No big deal -- so we just drop a few samples.
		 */
		pr_debug("SPU_PROF: No cached SPU context "
			 "for SPU #%d. Dropping samples.\n", spu_num);
		goto out;
	}

	map = c_info->map;
	the_spu = c_info->the_spu;
	spin_lock(&buffer_lock);
	for (i = 0; i < num_samples; i++) {
		unsigned int sample = *(samples+i);
		int grd_val = 0;
		file_offset = 0;
		if (sample == 0)
			continue;
		file_offset = vma_map_lookup(map, sample, the_spu, &grd_val);

		/* If overlays are used by this SPU application, the guard
		 * value is non-zero, indicating which overlay section is in
		 * use.  We need to discard samples taken during the time
		 * period in which an overlay switch occurs (i.e., the guard
		 * value changes).
		 */
		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
			spu_buff[spu_num].last_guard_val = grd_val;
			/* Drop the rest of the samples. */
			break;
		}

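		/* Each recorded sample packs the SPU number into the upper
		 * 32 bits and the file offset into the lower 32 bits.
		 */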
		/* We must ensure that the SPU context switch has been written
		 * out before samples for the SPU.  Otherwise, the SPU context
		 * information is not available and the postprocessing of the
		 * SPU PC will fail with no available anonymous map information.
		 */
		if (spu_buff[spu_num].ctx_sw_seen)
			spu_buff_add((file_offset | spu_num_shifted),
				     spu_num);
	}
	spin_unlock(&buffer_lock);
out:
	spin_unlock_irqrestore(&cache_lock, flags);
}
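/* Stop SPU profiling: unregister from SPU task-switch notifications,
 * flush and free the per-SPU buffers, and release all cached context
 * info.
 */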
int spu_sync_stop(void)
{
	unsigned long flags = 0;
	int ret;
	int k;

	ret = spu_switch_event_unregister(&spu_active);

	if (ret)
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: spu_switch_event_unregister "
		       "returned %d\n",
		       __func__, __LINE__, ret);

	/* flush any remaining data in the per SPU buffers */
	sync_spu_buff();

	spin_lock_irqsave(&cache_lock, flags);
	ret = release_cached_info(RELEASE_ALL);
	spin_unlock_irqrestore(&cache_lock, flags);

	/* Remove the scheduled work queue item rather than waiting
	 * for every queued entry to execute.  Then flush the pending
	 * system wide buffer to the event buffer.
	 */
	cancel_delayed_work(&spu_work);

	for (k = 0; k < num_spu_nodes; k++) {
		spu_buff[k].ctx_sw_seen = 0;

		/*
		 * spu_buff[k].buff will be NULL if there was a problem
		 * allocating the buffer.  kfree handles NULL, so just
		 * free and clear it.
		 */
		kfree(spu_buff[k].buff);
		spu_buff[k].buff = NULL;
	}
	pr_debug("spu_sync_stop -- done.\n");
	return ret;
}