/*
 * This file contains the code to configure and read/write the ia64 performance
 * monitoring stuff.
 *
 * Originally written by Ganesh Venkitachalam, IBM Corp.
 * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
 * Modifications by Stephane Eranian, Hewlett-Packard Co.
 * Copyright (C) 1999 Ganesh Venkitachalam
 * Copyright (C) 1999 David Mosberger-Tang
 * Copyright (C) 2000-2001 Stephane Eranian
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include	/* for ia64_get_itc() */

#ifdef CONFIG_PERFMON

#define PFM_VERSION		"0.3"
#define PFM_SMPL_HDR_VERSION	1

#define PMU_FIRST_COUNTER	4	/* first generic counter */

#define PFM_WRITE_PMCS		0xa0
#define PFM_WRITE_PMDS		0xa1
#define PFM_READ_PMDS		0xa2
#define PFM_STOP		0xa3
#define PFM_START		0xa4
#define PFM_ENABLE		0xa5	/* unfreeze only */
#define PFM_DISABLE		0xa6	/* freeze only */
#define PFM_RESTART		0xcf
#define PFM_CREATE_CONTEXT	0xa7
#define PFM_DESTROY_CONTEXT	0xa8

/*
 * Those 2 are just meant for debugging. I considered using sysctl() for
 * that but it is a little bit too pervasive. This solution is at least
 * self-contained.
 */
#define PFM_DEBUG_ON		0xe0
#define PFM_DEBUG_OFF		0xe1
#define PFM_DEBUG_BASE		PFM_DEBUG_ON

/*
 * perfmon API flags
 */
#define PFM_FL_INHERIT_NONE	 0x00	/* never inherit a context across fork (default) */
#define PFM_FL_INHERIT_ONCE	 0x01	/* clone pfm_context only once across fork() */
#define PFM_FL_INHERIT_ALL	 0x02	/* always clone pfm_context across fork() */
#define PFM_FL_SMPL_OVFL_NOBLOCK 0x04	/* do not block on sampling buffer overflow */
#define PFM_FL_SYSTEM_WIDE	 0x08	/* create a system wide context */
#define PFM_FL_EXCL_INTR	 0x10	/* exclude interrupts from system wide monitoring */

/*
 * PMC API flags
 */
#define PFM_REGFL_OVFL_NOTIFY	1	/* send notification on overflow */

/*
 * Private flags and masks
 */
#define PFM_FL_INHERIT_MASK	(PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)

#ifdef CONFIG_SMP
#define cpu_is_online(i)	(cpu_online_map & (1UL << i))
#else
#define cpu_is_online(i)	1
#endif

#define PMC_IS_IMPL(i)	  (i < pmu_conf.num_pmcs && pmu_conf.impl_regs[i>>6] & (1UL << (i & (64-1))))
#define PMD_IS_IMPL(i)	  (i < pmu_conf.num_pmds && pmu_conf.impl_regs[4+(i>>6)] & (1UL << (i & (64-1))))
#define PMD_IS_COUNTER(i) (i >= PMU_FIRST_COUNTER && i < (PMU_FIRST_COUNTER+pmu_conf.max_counters))
#define PMC_IS_COUNTER(i) (i >= PMU_FIRST_COUNTER && i < (PMU_FIRST_COUNTER+pmu_conf.max_counters))

/* This is the Itanium-specific PMC layout for counter config */
typedef struct {
	unsigned long pmc_plm:4;	/* privilege level mask */
	unsigned long pmc_ev:1;		/* external visibility */
	unsigned long pmc_oi:1;		/* overflow interrupt */
	unsigned long pmc_pm:1;		/* privileged monitor */
	unsigned long pmc_ig1:1;	/* reserved */
	unsigned long pmc_es:7;		/* event select */
	unsigned long pmc_ig2:1;	/* reserved */
	unsigned long pmc_umask:4;	/* unit mask */
	unsigned long pmc_thres:3;	/* threshold */
	unsigned long pmc_ig3:1;	/* reserved (missing from table on p6-17) */
	unsigned long pmc_ism:2;	/* instruction set mask */
	unsigned long pmc_ig4:38;	/* reserved */
} pmc_counter_reg_t;

/* test for EAR/BTB configuration */
#define PMU_DEAR_EVENT	0x67
#define PMU_IEAR_EVENT	0x23
#define PMU_BTB_EVENT	0x11
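/*
 * Illustrative sketch (not compiled): a monitoring tool builds the value it
 * hands to PFM_WRITE_PMCS by filling in the bit-fields of the layout above.
 * The event code reuses PMU_DEAR_EVENT defined above; the privilege level
 * mask and the counter index chosen here are hypothetical and depend on what
 * the tool wants to measure.
 *
 *	pmc_counter_reg_t reg;
 *	perfmon_req_t     req;
 *
 *	memset(&reg, 0, sizeof(reg));
 *	reg.pmc_plm = 0x8;			(example privilege level mask)
 *	reg.pmc_oi  = 1;			(raise an interrupt on overflow)
 *	reg.pmc_es  = PMU_DEAR_EVENT;		(program this counter as a D-EAR)
 *
 *	req.pfr_reg.reg_num   = PMU_FIRST_COUNTER;	(hypothetical generic counter)
 *	req.pfr_reg.reg_value = *(unsigned long *)&reg;
 *
 * The request is then passed to pfm_write_pmcs() below through sys_perfmonctl().
 */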
#define PMC_IS_DEAR(a)	(((pmc_counter_reg_t *)(a))->pmc_es == PMU_DEAR_EVENT)
#define PMC_IS_IEAR(a)	(((pmc_counter_reg_t *)(a))->pmc_es == PMU_IEAR_EVENT)
#define PMC_IS_BTB(a)	(((pmc_counter_reg_t *)(a))->pmc_es == PMU_BTB_EVENT)

/*
 * This header is at the beginning of the sampling buffer returned to the user.
 * It is exported as read-only at this point. It is directly followed by the
 * first record.
 */
typedef struct {
	int		hdr_version;	/* could be used to differentiate formats */
	int		hdr_reserved;
	unsigned long	hdr_entry_size;	/* size of one entry in bytes */
	unsigned long	hdr_count;	/* how many valid entries */
	unsigned long	hdr_pmds;	/* which pmds are recorded */
} perfmon_smpl_hdr_t;

/*
 * Each record in the buffer begins with the header below.
 * The header is directly followed by the PMDs to be saved, in increasing index
 * order: PMD4, PMD5, .... How many PMDs are present is determined by the tool,
 * which must keep track of it when generating the final trace file.
 */
typedef struct {
	int		pid;	/* identification of process */
	int		cpu;	/* which cpu was used */
	unsigned long	rate;	/* initial value of this counter */
	unsigned long	stamp;	/* timestamp */
	unsigned long	ip;	/* where the overflow interrupt happened */
	unsigned long	regs;	/* which registers overflowed (up to 64) */
} perfmon_smpl_entry_t;

/*
 * There is one such data structure per perfmon context. It is used to describe
 * the sampling buffer. It is shared among siblings, whereas the pfm_context
 * isn't. Therefore we maintain a refcnt which is incremented on fork().
 * This descriptor is private to the kernel; only the actual sampling buffer,
 * including its header, is exposed to the user. This construct allows us to
 * export the buffer read-write, if needed, without worrying about security
 * problems.
 */
typedef struct {
	atomic_t		psb_refcnt;	/* how many users for the buffer */
	int			reserved;
	void			*psb_addr;	/* points to location of first entry */
	unsigned long		psb_entries;	/* maximum number of entries */
	unsigned long		psb_size;	/* aligned size of buffer */
	unsigned long		psb_index;	/* next free entry slot */
	unsigned long		psb_entry_size;	/* size of each entry including entry header */
	perfmon_smpl_hdr_t	*psb_hdr;	/* points to sampling buffer header */
} pfm_smpl_buffer_desc_t;

/*
 * This structure is initialized at boot time and contains
 * a description of the PMU main characteristics as indicated
 * by PAL.
 */
typedef struct {
	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
	unsigned long perf_ovfl_val;	/* overflow value for generic counters */
	unsigned long max_counters;	/* upper limit on counter pairs (PMC/PMD) */
	unsigned long num_pmcs;		/* highest PMC implemented (may have holes) */
	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
	unsigned long impl_regs[16];	/* buffer used to hold implemented PMC/PMD mask */
} pmu_config_t;

#define PERFMON_IS_DISABLED()	pmu_conf.pfm_is_disabled

typedef struct {
	__u64	val;		/* virtual 64bit counter value */
	__u64	ival;		/* initial value from user */
	__u64	smpl_rval;	/* reset value on sampling overflow */
	__u64	ovfl_rval;	/* reset value on overflow */
	int	flags;		/* notify/do not notify */
} pfm_counter_t;

#define PMD_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY)
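/*
 * Illustrative sketch (not compiled): how a user-level tool could walk the
 * sampling buffer it mapped through PFM_CREATE_CONTEXT. The variable names
 * are hypothetical; the layout (header, then hdr_count records of
 * hdr_entry_size bytes, each record being a perfmon_smpl_entry_t followed by
 * the PMDs selected in hdr_pmds, in increasing index order) is the one
 * defined above.
 *
 *	perfmon_smpl_hdr_t *hdr = smpl_vaddr;
 *	char *rec = (char *)(hdr + 1);
 *	unsigned long k;
 *
 *	for (k = 0; k < hdr->hdr_count; k++, rec += hdr->hdr_entry_size) {
 *		perfmon_smpl_entry_t *ent = (perfmon_smpl_entry_t *)rec;
 *		unsigned long *pmd = (unsigned long *)(ent + 1);
 *		... decode ent->ip, ent->regs, then one pmd[] slot per bit
 *		    set in hdr->hdr_pmds ...
 *	}
 */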
/*
 * perfmon context. One per process; it is cloned on fork() depending on the
 * inheritance flags.
 */
typedef struct {
	unsigned int inherit:2;	/* inherit mode */
	unsigned int noblock:1;	/* block/don't block on overflow with notification */
	unsigned int system:1;	/* do system wide monitoring */
	unsigned int frozen:1;	/* pmu must be kept frozen on ctxsw in */
	unsigned int exclintr:1;/* exclude interrupts from system wide monitoring */
	unsigned int reserved:26;
} pfm_context_flags_t;

typedef struct pfm_context {
	pfm_smpl_buffer_desc_t	*ctx_smpl_buf;		/* sampling buffer descriptor, if any */
	unsigned long		ctx_dear_counter;	/* which PMD holds D-EAR */
	unsigned long		ctx_iear_counter;	/* which PMD holds I-EAR */
	unsigned long		ctx_btb_counter;	/* which PMD holds BTB */

	spinlock_t		ctx_notify_lock;
	pfm_context_flags_t	ctx_flags;		/* block/noblock */
	int			ctx_notify_sig;		/* XXX: SIGPROF or other */
	struct task_struct	*ctx_notify_task;	/* who to notify on overflow */
	struct task_struct	*ctx_creator;		/* pid of creator (debug) */

	unsigned long		ctx_ovfl_regs;		/* which registers just overflowed (notification) */
	unsigned long		ctx_smpl_regs;		/* which registers to record on overflow */

	struct semaphore	ctx_restart_sem;	/* used for blocking notification mode */

	unsigned long		ctx_used_pmds[4];	/* bitmask of used PMDs (speeds up ctxsw) */
	unsigned long		ctx_used_pmcs[4];	/* bitmask of used PMCs (speeds up ctxsw) */

	pfm_counter_t		ctx_pmds[IA64_NUM_PMD_COUNTERS];	/* XXX: size should be dynamic */
} pfm_context_t;

#define CTX_USED_PMD(ctx,n)	(ctx)->ctx_used_pmds[(n)>>6] |= 1UL << ((n) % 64)
#define CTX_USED_PMC(ctx,n)	(ctx)->ctx_used_pmcs[(n)>>6] |= 1UL << ((n) % 64)

#define ctx_fl_inherit	ctx_flags.inherit
#define ctx_fl_noblock	ctx_flags.noblock
#define ctx_fl_system	ctx_flags.system
#define ctx_fl_frozen	ctx_flags.frozen
#define ctx_fl_exclintr	ctx_flags.exclintr

#define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_noblock == 1)
#define CTX_INHERIT_MODE(c)	((c)->ctx_fl_inherit)
#define CTX_HAS_SMPL(c)		((c)->ctx_smpl_buf != NULL)

static pmu_config_t pmu_conf;

/* for debug only */
static int pfm_debug = 0;	/* 0 = nodebug, >0 = debug output on */

#define DBprintk(a) \
	do { \
		if (pfm_debug > 0) { printk(__FUNCTION__" %d: ", __LINE__); printk a; } \
	} while (0);

static void ia64_reset_pmu(void);

/*
 * structure used to pass information between the interrupt handler
 * and the tasklet.
*/ typedef struct { pid_t to_pid; /* which process to notify */ pid_t from_pid; /* which process is source of overflow */ int sig; /* with which signal */ unsigned long bitvect; /* which counters have overflowed */ } notification_info_t; typedef struct { unsigned long pfs_proc_sessions; unsigned long pfs_sys_session; /* can only be 0/1 */ unsigned long pfs_dfl_dcr; /* XXX: hack */ unsigned int pfs_pp; } pfm_session_t; struct { struct task_struct *owner; } ____cacheline_aligned pmu_owners[NR_CPUS]; /* * helper macros */ #define SET_PMU_OWNER(t) do { pmu_owners[smp_processor_id()].owner = (t); } while(0); #define PMU_OWNER() pmu_owners[smp_processor_id()].owner #ifdef CONFIG_SMP #define PFM_CAN_DO_LAZY() (smp_num_cpus==1 && pfs_info.pfs_sys_session==0) #else #define PFM_CAN_DO_LAZY() (pfs_info.pfs_sys_session==0) #endif static void pfm_lazy_save_regs (struct task_struct *ta); /* for debug only */ static struct proc_dir_entry *perfmon_dir; /* * XXX: hack to indicate that a system wide monitoring session is active */ static pfm_session_t pfs_info; /* * finds the number of PM(C|D) registers given * the bitvector returned by PAL */ static unsigned long __init find_num_pm_regs(long *buffer) { int i=3; /* 4 words/per bitvector */ /* start from the most significant word */ while (i>=0 && buffer[i] == 0 ) i--; if (i< 0) { printk(KERN_ERR "perfmon: No bit set in pm_buffer\n"); return 0; } return 1+ ia64_fls(buffer[i]) + 64 * i; } /* * Generates a unique (per CPU) timestamp */ static inline unsigned long perfmon_get_stamp(void) { /* * XXX: maybe find something more efficient */ return ia64_get_itc(); } /* Given PGD from the address space's page table, return the kernel * virtual mapping of the physical memory mapped at ADR. */ static inline unsigned long uvirt_to_kva(pgd_t *pgd, unsigned long adr) { unsigned long ret = 0UL; pmd_t *pmd; pte_t *ptep, pte; if (!pgd_none(*pgd)) { pmd = pmd_offset(pgd, adr); if (!pmd_none(*pmd)) { ptep = pte_offset(pmd, adr); pte = *ptep; if (pte_present(pte)) { ret = (unsigned long) page_address(pte_page(pte)); ret |= (adr & (PAGE_SIZE - 1)); } } } DBprintk(("uv2kva(%lx-->%lx)\n", adr, ret)); return ret; } /* Here we want the physical address of the memory. * This is used when initializing the contents of the * area and marking the pages as reserved. */ static inline unsigned long kvirt_to_pa(unsigned long adr) { __u64 pa = ia64_tpa(adr); DBprintk(("kv2pa(%lx-->%lx)\n", adr, pa)); return pa; } static void * rvmalloc(unsigned long size) { void *mem; unsigned long adr, page; /* XXX: may have to revisit this part because * vmalloc() does not necessarily return a page-aligned buffer. 
* This maybe a security problem when mapped at user level */ mem=vmalloc(size); if (mem) { memset(mem, 0, size); /* Clear the ram out, no junk to the user */ adr=(unsigned long) mem; while (size > 0) { page = kvirt_to_pa(adr); mem_map_reserve(virt_to_page(__va(page))); adr+=PAGE_SIZE; size-=PAGE_SIZE; } } return mem; } static void rvfree(void *mem, unsigned long size) { unsigned long adr, page; if (mem) { adr=(unsigned long) mem; while (size > 0) { page = kvirt_to_pa(adr); mem_map_unreserve(virt_to_page(__va(page))); adr+=PAGE_SIZE; size-=PAGE_SIZE; } vfree(mem); } } static pfm_context_t * pfm_context_alloc(void) { pfm_context_t *pfc; /* allocate context descriptor */ pfc = vmalloc(sizeof(*pfc)); if (pfc) memset(pfc, 0, sizeof(*pfc)); return pfc; } static void pfm_context_free(pfm_context_t *pfc) { if (pfc) vfree(pfc); } static int pfm_remap_buffer(unsigned long buf, unsigned long addr, unsigned long size) { unsigned long page; while (size > 0) { page = kvirt_to_pa(buf); if (remap_page_range(addr, page, PAGE_SIZE, PAGE_SHARED)) return -ENOMEM; addr += PAGE_SIZE; buf += PAGE_SIZE; size -= PAGE_SIZE; } return 0; } /* * counts the number of PMDS to save per entry. * This code is generic enough to accomodate more than 64 PMDS when they become available */ static unsigned long pfm_smpl_entry_size(unsigned long *which, unsigned long size) { unsigned long res = 0; int i; for (i=0; i < size; i++, which++) res += hweight64(*which); DBprintk((" res=%ld\n", res)); return res; } /* * Allocates the sampling buffer and remaps it into caller's address space */ static int pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long which_pmds, unsigned long entries, void **user_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, size, regcount; void *smpl_buf; pfm_smpl_buffer_desc_t *psb; regcount = pfm_smpl_entry_size(&which_pmds, 1); /* note that regcount might be 0, in this case only the header for each * entry will be recorded. 
*/ /* * 1 buffer hdr and for each entry a header + regcount PMDs to save */ size = PAGE_ALIGN( sizeof(perfmon_smpl_hdr_t) + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64))); /* * check requested size to avoid Denial-of-service attacks * XXX: may have to refine this test */ if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN; /* find some free area in address space */ addr = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE); if (!addr) goto no_addr; DBprintk((" entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, addr)); /* allocate vma */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) goto no_vma; /* XXX: see rvmalloc() for page alignment problem */ smpl_buf = rvmalloc(size); if (smpl_buf == NULL) goto no_buffer; DBprintk((" smpl_buf @%p\n", smpl_buf)); if (pfm_remap_buffer((unsigned long)smpl_buf, addr, size)) goto cant_remap; /* allocate sampling buffer descriptor now */ psb = vmalloc(sizeof(*psb)); if (psb == NULL) goto no_buffer_desc; /* start with something clean */ memset(smpl_buf, 0x0, size); psb->psb_hdr = smpl_buf; psb->psb_addr = (char *)smpl_buf+sizeof(perfmon_smpl_hdr_t); /* first entry */ psb->psb_size = size; /* aligned size */ psb->psb_index = 0; psb->psb_entries = entries; atomic_set(&psb->psb_refcnt, 1); psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64); DBprintk((" psb @%p entry_size=%ld hdr=%p addr=%p\n", (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr, (void *)psb->psb_addr)); /* initialize some of the fields of header */ psb->psb_hdr->hdr_version = PFM_SMPL_HDR_VERSION; psb->psb_hdr->hdr_entry_size = sizeof(perfmon_smpl_entry_t)+regcount*sizeof(u64); psb->psb_hdr->hdr_pmds = which_pmds; /* store which PMDS to record */ ctx->ctx_smpl_regs = which_pmds; /* link to perfmon context */ ctx->ctx_smpl_buf = psb; /* * initialize the vma for the sampling buffer */ vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + size; vma->vm_flags = VM_READ|VM_MAYREAD; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ vma->vm_ops = NULL; vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_raend = 0; vma->vm_private_data = ctx; /* link to pfm_context(not yet used) */ /* * now insert the vma in the vm list for the process */ insert_vm_struct(mm, vma); mm->total_vm += size >> PAGE_SHIFT; /* * that's the address returned to the user */ *user_addr = (void *)addr; return 0; /* outlined error handling */ no_addr: DBprintk(("Cannot find unmapped area for size %ld\n", size)); return -ENOMEM; no_vma: DBprintk(("Cannot allocate vma\n")); return -ENOMEM; cant_remap: DBprintk(("Can't remap buffer\n")); rvfree(smpl_buf, size); no_buffer: DBprintk(("Can't allocate sampling buffer\n")); kmem_cache_free(vm_area_cachep, vma); return -ENOMEM; no_buffer_desc: DBprintk(("Can't allocate sampling buffer descriptor\n")); kmem_cache_free(vm_area_cachep, vma); rvfree(smpl_buf, size); return -ENOMEM; } static int pfx_is_sane(pfreq_context_t *pfx) { int ctx_flags; /* valid signal */ //if (pfx->notify_sig < 1 || pfx->notify_sig >= _NSIG) return -EINVAL; if (pfx->notify_sig !=0 && pfx->notify_sig != SIGPROF) return -EINVAL; /* cannot send to process 1, 0 means do not notify */ if (pfx->notify_pid < 0 || pfx->notify_pid == 1) return -EINVAL; ctx_flags = pfx->flags; if (ctx_flags & PFM_FL_SYSTEM_WIDE) { #ifdef CONFIG_SMP if (smp_num_cpus > 1) { printk("perfmon: system wide monitoring on SMP not yet supported\n"); return -EINVAL; } #endif if ((ctx_flags & PFM_FL_SMPL_OVFL_NOBLOCK) == 0) { printk("perfmon: system wide 
monitoring cannot use blocking notification mode\n"); return -EINVAL; } } /* probably more to add here */ return 0; } static int pfm_context_create(int flags, perfmon_req_t *req) { pfm_context_t *ctx; struct task_struct *task = NULL; perfmon_req_t tmp; void *uaddr = NULL; int ret; int ctx_flags; pid_t pid; /* to go away */ if (flags) { printk("perfmon: use context flags instead of perfmon() flags. Obsoleted API\n"); } if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT; ret = pfx_is_sane(&tmp.pfr_ctx); if (ret < 0) return ret; ctx_flags = tmp.pfr_ctx.flags; if (ctx_flags & PFM_FL_SYSTEM_WIDE) { /* * XXX: This is not AT ALL SMP safe */ if (pfs_info.pfs_proc_sessions > 0) return -EBUSY; if (pfs_info.pfs_sys_session > 0) return -EBUSY; pfs_info.pfs_sys_session = 1; } else if (pfs_info.pfs_sys_session >0) { /* no per-process monitoring while there is a system wide session */ return -EBUSY; } else pfs_info.pfs_proc_sessions++; ctx = pfm_context_alloc(); if (!ctx) goto error; /* record the creator (debug only) */ ctx->ctx_creator = current; pid = tmp.pfr_ctx.notify_pid; spin_lock_init(&ctx->ctx_notify_lock); if (pid == current->pid) { ctx->ctx_notify_task = task = current; current->thread.pfm_context = ctx; atomic_set(¤t->thread.pfm_notifiers_check, 1); } else if (pid!=0) { read_lock(&tasklist_lock); task = find_task_by_pid(pid); if (task) { /* * record who to notify */ ctx->ctx_notify_task = task; /* * make visible * must be done inside critical section * * if the initialization does not go through it is still * okay because child will do the scan for nothing which * won't hurt. */ current->thread.pfm_context = ctx; /* * will cause task to check on exit for monitored * processes that would notify it. see release_thread() * Note: the scan MUST be done in release thread, once the * task has been detached from the tasklist otherwise you are * exposed to race conditions. */ atomic_add(1, &task->thread.pfm_notifiers_check); } read_unlock(&tasklist_lock); } /* * notification process does not exist */ if (pid != 0 && task == NULL) { ret = -EINVAL; goto buffer_error; } ctx->ctx_notify_sig = SIGPROF; /* siginfo imposes a fixed signal */ if (tmp.pfr_ctx.smpl_entries) { DBprintk((" sampling entries=%ld\n",tmp.pfr_ctx.smpl_entries)); ret = pfm_smpl_buffer_alloc(ctx, tmp.pfr_ctx.smpl_regs, tmp.pfr_ctx.smpl_entries, &uaddr); if (ret<0) goto buffer_error; tmp.pfr_ctx.smpl_vaddr = uaddr; } /* initialization of context's flags */ ctx->ctx_fl_inherit = ctx_flags & PFM_FL_INHERIT_MASK; ctx->ctx_fl_noblock = (ctx_flags & PFM_FL_SMPL_OVFL_NOBLOCK) ? 1 : 0; ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; ctx->ctx_fl_exclintr = (ctx_flags & PFM_FL_EXCL_INTR) ? 1: 0; ctx->ctx_fl_frozen = 0; /* * Keep track of the pmds we want to sample * XXX: may be we don't need to save/restore the DEAR/IEAR pmds * but we do need the BTB for sure. This is because of a hardware * buffer of 1 only for non-BTB pmds. 
*/ ctx->ctx_used_pmds[0] = tmp.pfr_ctx.smpl_regs; ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */ sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */ if (copy_to_user(req, &tmp, sizeof(tmp))) { ret = -EFAULT; goto buffer_error; } DBprintk((" context=%p, pid=%d notify_sig %d notify_task=%p\n",(void *)ctx, current->pid, ctx->ctx_notify_sig, ctx->ctx_notify_task)); DBprintk((" context=%p, pid=%d flags=0x%x inherit=%d noblock=%d system=%d\n",(void *)ctx, current->pid, ctx_flags, ctx->ctx_fl_inherit, ctx->ctx_fl_noblock, ctx->ctx_fl_system)); /* * when no notification is required, we can make this visible at the last moment */ if (pid == 0) current->thread.pfm_context = ctx; /* * by default, we always include interrupts for system wide * DCR.pp is set by default to zero by kernel in cpu_init() */ if (ctx->ctx_fl_system) { if (ctx->ctx_fl_exclintr == 0) { unsigned long dcr = ia64_get_dcr(); ia64_set_dcr(dcr|IA64_DCR_PP); /* * keep track of the kernel default value */ pfs_info.pfs_dfl_dcr = dcr; DBprintk((" dcr.pp is set\n")); } } return 0; buffer_error: pfm_context_free(ctx); error: /* * undo session reservation */ if (ctx_flags & PFM_FL_SYSTEM_WIDE) { pfs_info.pfs_sys_session = 0; } else { pfs_info.pfs_proc_sessions--; } return ret; } static void pfm_reset_regs(pfm_context_t *ctx) { unsigned long mask = ctx->ctx_ovfl_regs; int i, cnum; DBprintk((" ovfl_regs=0x%lx\n", mask)); /* * now restore reset value on sampling overflowed counters */ for(i=0, cnum=PMU_FIRST_COUNTER; i < pmu_conf.max_counters; i++, cnum++, mask >>= 1) { if (mask & 0x1) { DBprintk((" reseting PMD[%d]=%lx\n", cnum, ctx->ctx_pmds[i].smpl_rval & pmu_conf.perf_ovfl_val)); /* upper part is ignored on rval */ ia64_set_pmd(cnum, ctx->ctx_pmds[i].smpl_rval); /* * we must reset BTB index (clears pmd16.full to make * sure we do not report the same branches twice. * The non-blocking case in handled in update_counters() */ if (cnum == ctx->ctx_btb_counter) { DBprintk(("reseting PMD16\n")); ia64_set_pmd(16, 0); } } } /* just in case ! 
 */
	ctx->ctx_ovfl_regs = 0;
}

static int
pfm_write_pmcs(struct task_struct *ta, perfmon_req_t *req, int count)
{
	struct thread_struct *th = &ta->thread;
	pfm_context_t *ctx = th->pfm_context;
	perfmon_req_t tmp;
	unsigned long cnum;
	int i;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {

		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum = tmp.pfr_reg.reg_num;

		/* XXX needs to check validity of the data maybe */
		if (!PMC_IS_IMPL(cnum)) {
			DBprintk((" invalid pmc[%ld]\n", cnum));
			return -EINVAL;
		}

		if (PMC_IS_COUNTER(cnum)) {
			/*
			 * we keep track of EARS/BTB to speed up sampling later
			 */
			if (PMC_IS_DEAR(&tmp.pfr_reg.reg_value)) {
				ctx->ctx_dear_counter = cnum;
			} else if (PMC_IS_IEAR(&tmp.pfr_reg.reg_value)) {
				ctx->ctx_iear_counter = cnum;
			} else if (PMC_IS_BTB(&tmp.pfr_reg.reg_value)) {
				ctx->ctx_btb_counter = cnum;
			}
#if 0
			if (tmp.pfr_reg.reg_flags & PFM_REGFL_OVFL_NOTIFY)
				ctx->ctx_pmds[cnum - PMU_FIRST_COUNTER].flags |= PFM_REGFL_OVFL_NOTIFY;
#endif
		}
		/* keep track of what we use */
		CTX_USED_PMC(ctx, cnum);

		ia64_set_pmc(cnum, tmp.pfr_reg.reg_value);

		DBprintk((" setting PMC[%ld]=0x%lx flags=0x%x used_pmcs=0x%lx\n",
			  cnum, tmp.pfr_reg.reg_value,
			  ctx->ctx_pmds[cnum - PMU_FIRST_COUNTER].flags,
			  ctx->ctx_used_pmcs[0]));
	}
	/*
	 * we have to set this here even though we haven't necessarily started monitoring,
	 * because we may be context switched out
	 */
	if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

	return 0;
}

static int
pfm_write_pmds(struct task_struct *ta, perfmon_req_t *req, int count)
{
	struct thread_struct *th = &ta->thread;
	pfm_context_t *ctx = th->pfm_context;
	perfmon_req_t tmp;
	unsigned long cnum;
	int i;

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {
		int k;

		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		cnum = tmp.pfr_reg.reg_num;
		k    = cnum - PMU_FIRST_COUNTER;

		if (!PMD_IS_IMPL(cnum)) return -EINVAL;

		/* update virtualized (64bits) counter */
		if (PMD_IS_COUNTER(cnum)) {
			ctx->ctx_pmds[k].ival      = tmp.pfr_reg.reg_value;
			ctx->ctx_pmds[k].val       = tmp.pfr_reg.reg_value & ~pmu_conf.perf_ovfl_val;
			ctx->ctx_pmds[k].smpl_rval = tmp.pfr_reg.reg_smpl_reset;
			ctx->ctx_pmds[k].ovfl_rval = tmp.pfr_reg.reg_ovfl_reset;

			if (tmp.pfr_reg.reg_flags & PFM_REGFL_OVFL_NOTIFY)
				ctx->ctx_pmds[cnum - PMU_FIRST_COUNTER].flags |= PFM_REGFL_OVFL_NOTIFY;
		}
		/* keep track of what we use */
		CTX_USED_PMD(ctx, cnum);

		/* writes to the unimplemented part are ignored, so this is safe */
		ia64_set_pmd(cnum, tmp.pfr_reg.reg_value);

		/* to go away */
		ia64_srlz_d();

		DBprintk((" setting PMD[%ld]: ovfl_notify=%d pmd.val=0x%lx pmd.ovfl_rval=0x%lx pmd.smpl_rval=0x%lx pmd=%lx used_pmds=0x%lx\n",
			  cnum, PMD_OVFL_NOTIFY(ctx, cnum - PMU_FIRST_COUNTER),
			  ctx->ctx_pmds[k].val, ctx->ctx_pmds[k].ovfl_rval, ctx->ctx_pmds[k].smpl_rval,
			  ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val,
			  ctx->ctx_used_pmds[0]));
	}
	/*
	 * we have to set this here even though we haven't necessarily started monitoring,
	 * because we may be context switched out
	 */
	if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

	return 0;
}
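/*
 * Note on the 64-bit counter virtualization used by pfm_write_pmds() above and
 * by pfm_read_pmds()/update_counters() below (the 32-bit hardware width is only
 * an example; the actual width comes from PAL via pmu_conf.perf_ovfl_val):
 *
 *	PFM_WRITE_PMDS with reg_value = 0xfffffff0 and perf_ovfl_val = 0xffffffff
 *		ctx_pmds[k].val = 0xfffffff0 & ~0xffffffff = 0	(software upper bits)
 *		PMD             = 0xfffffff0			(hardware lower bits)
 *
 *	after 16 events the hardware PMD wraps to 0 and overflow processing does
 *		ctx_pmds[k].val += 1 + perf_ovfl_val + (PMD & perf_ovfl_val)
 *	i.e. val becomes 0x100000000, so a later PFM_READ_PMDS returns
 *		(PMD & perf_ovfl_val) + ctx_pmds[k].val
 *	which is the full 64-bit count.
 */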
static int
pfm_read_pmds(struct task_struct *ta, perfmon_req_t *req, int count)
{
	struct thread_struct *th = &ta->thread;
	pfm_context_t *ctx = th->pfm_context;
	unsigned long val = 0;
	perfmon_req_t tmp;
	int i;

	/*
	 * XXX: MUST MAKE SURE WE DON'T HAVE ANY PENDING OVERFLOW BEFORE READING.
	 * This is required when the monitoring has been stopped by user or kernel.
	 * If it is still going on, then that's fine because we are not guaranteed
	 * to return an accurate value in this case.
	 */

	/* XXX: ctx locking may be required here */

	for (i = 0; i < count; i++, req++) {
		unsigned long reg_val = ~0, ctx_val = ~0;

		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

		if (!PMD_IS_IMPL(tmp.pfr_reg.reg_num)) return -EINVAL;

		if (PMD_IS_COUNTER(tmp.pfr_reg.reg_num)) {
			if (ta == current) {
				val = ia64_get_pmd(tmp.pfr_reg.reg_num);
			} else {
				val = reg_val = th->pmd[tmp.pfr_reg.reg_num];
			}
			val &= pmu_conf.perf_ovfl_val;
			/*
			 * lower part of .val may not be zero, so we must add it because of
			 * the residual count (see update_counters).
			 */
			val += ctx_val = ctx->ctx_pmds[tmp.pfr_reg.reg_num - PMU_FIRST_COUNTER].val;
		} else {
			/* for now */
			if (ta != current) return -EINVAL;

			ia64_srlz_d();
			val = ia64_get_pmd(tmp.pfr_reg.reg_num);
		}
		tmp.pfr_reg.reg_value = val;

		DBprintk((" reading PMD[%ld]=0x%lx reg=0x%lx ctx_val=0x%lx pmc=0x%lx\n",
			  tmp.pfr_reg.reg_num, val, reg_val, ctx_val,
			  ia64_get_pmc(tmp.pfr_reg.reg_num)));

		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
	}
	return 0;
}

static int
pfm_do_restart(struct task_struct *task)
{
	struct thread_struct *th = &task->thread;
	pfm_context_t *ctx = th->pfm_context;
	void *sem = &ctx->ctx_restart_sem;

	if (task == current) {
		DBprintk((" restarting self %d frozen=%d \n", current->pid, ctx->ctx_fl_frozen));

		pfm_reset_regs(ctx);

		/*
		 * We ignore block/don't block because we never block
		 * for a self-monitoring process.
		 */
		ctx->ctx_fl_frozen = 0;

		if (CTX_HAS_SMPL(ctx)) {
			ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0;
			ctx->ctx_smpl_buf->psb_index = 0;
		}

		/* pfm_reset_smpl_buffers(ctx,th->pfm_ovfl_regs);*/

		/* simply unfreeze */
		ia64_set_pmc(0, 0);
		ia64_srlz_d();

		return 0;
	}
	/* check if blocking */
	if (CTX_OVFL_NOBLOCK(ctx) == 0) {
		DBprintk((" unblocking %d \n", task->pid));
		up(sem);
		return 0;
	}

	/*
	 * in non-blocking mode, it is just a matter of resetting
	 * the sampling buffer (if any) index. The PMU
	 * is already active.
*/ /* * must reset the header count first */ if (CTX_HAS_SMPL(ctx)) { DBprintk((" resetting sampling indexes for %d \n", task->pid)); ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0; ctx->ctx_smpl_buf->psb_index = 0; } return 0; } /* * system-wide mode: propagate activation/desactivation throughout the tasklist * * XXX: does not work for SMP, of course */ static void pfm_process_tasklist(int cmd) { struct task_struct *p; struct pt_regs *regs; for_each_task(p) { regs = (struct pt_regs *)((unsigned long)p + IA64_STK_OFFSET); regs--; ia64_psr(regs)->pp = cmd; } } static int do_perfmonctl (struct task_struct *task, int cmd, int flags, perfmon_req_t *req, int count, struct pt_regs *regs) { perfmon_req_t tmp; struct thread_struct *th = &task->thread; pfm_context_t *ctx = th->pfm_context; memset(&tmp, 0, sizeof(tmp)); if (ctx == NULL && cmd != PFM_CREATE_CONTEXT && cmd < PFM_DEBUG_BASE) { DBprintk((" PFM_WRITE_PMCS: no context for task %d\n", task->pid)); return -EINVAL; } switch (cmd) { case PFM_CREATE_CONTEXT: /* a context has already been defined */ if (ctx) return -EBUSY; /* * cannot directly create a context in another process */ if (task != current) return -EINVAL; if (req == NULL || count != 1) return -EINVAL; if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; return pfm_context_create(flags, req); case PFM_WRITE_PMCS: /* we don't quite support this right now */ if (task != current) return -EINVAL; if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; return pfm_write_pmcs(task, req, count); case PFM_WRITE_PMDS: /* we don't quite support this right now */ if (task != current) return -EINVAL; if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; return pfm_write_pmds(task, req, count); case PFM_START: /* we don't quite support this right now */ if (task != current) return -EINVAL; if (PMU_OWNER() && PMU_OWNER() != current && PFM_CAN_DO_LAZY()) pfm_lazy_save_regs(PMU_OWNER()); SET_PMU_OWNER(current); /* will start monitoring right after rfi */ ia64_psr(regs)->up = 1; ia64_psr(regs)->pp = 1; if (ctx->ctx_fl_system) { pfm_process_tasklist(1); pfs_info.pfs_pp = 1; } /* * mark the state as valid. * this will trigger save/restore at context switch */ if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID; ia64_set_pmc(0, 0); ia64_srlz_d(); break; case PFM_ENABLE: /* we don't quite support this right now */ if (task != current) return -EINVAL; if (PMU_OWNER() && PMU_OWNER() != current && PFM_CAN_DO_LAZY()) pfm_lazy_save_regs(PMU_OWNER()); /* reset all registers to stable quiet state */ ia64_reset_pmu(); /* make sure nothing starts */ ia64_psr(regs)->up = 0; ia64_psr(regs)->pp = 0; /* do it on the live register as well */ __asm__ __volatile__ ("rsm psr.pp|psr.pp;;"::: "memory"); SET_PMU_OWNER(current); /* * mark the state as valid. * this will trigger save/restore at context switch */ if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID; /* simply unfreeze */ ia64_set_pmc(0, 0); ia64_srlz_d(); break; case PFM_DISABLE: /* we don't quite support this right now */ if (task != current) return -EINVAL; /* simply freeze */ ia64_set_pmc(0, 1); ia64_srlz_d(); /* * XXX: cannot really toggle IA64_THREAD_PM_VALID * but context is still considered valid, so any * read request would return something valid. Same * thing when this task terminates (pfm_flush_regs()). 
*/ break; case PFM_READ_PMDS: if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; if (!access_ok(VERIFY_WRITE, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT; return pfm_read_pmds(task, req, count); case PFM_STOP: /* we don't quite support this right now */ if (task != current) return -EINVAL; /* simply stop monitors, not PMU */ ia64_psr(regs)->up = 0; ia64_psr(regs)->pp = 0; if (ctx->ctx_fl_system) { pfm_process_tasklist(0); pfs_info.pfs_pp = 0; } break; case PFM_RESTART: /* temporary, will most likely end up as a PFM_ENABLE */ if ((th->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system==0) { printk(" PFM_RESTART not monitoring\n"); return -EINVAL; } if (CTX_OVFL_NOBLOCK(ctx) == 0 && ctx->ctx_fl_frozen==0) { printk("task %d without pmu_frozen set\n", task->pid); return -EINVAL; } return pfm_do_restart(task); /* we only look at first entry */ case PFM_DESTROY_CONTEXT: /* we don't quite support this right now */ if (task != current) return -EINVAL; /* first stop monitors */ ia64_psr(regs)->up = 0; ia64_psr(regs)->pp = 0; /* then freeze PMU */ ia64_set_pmc(0, 1); ia64_srlz_d(); /* don't save/restore on context switch */ if (ctx->ctx_fl_system ==0) task->thread.flags &= ~IA64_THREAD_PM_VALID; SET_PMU_OWNER(NULL); /* now free context and related state */ pfm_context_exit(task); break; case PFM_DEBUG_ON: printk("perfmon debugging on\n"); pfm_debug = 1; break; case PFM_DEBUG_OFF: printk("perfmon debugging off\n"); pfm_debug = 0; break; default: DBprintk((" UNknown command 0x%x\n", cmd)); return -EINVAL; } return 0; } /* * XXX: do something better here */ static int perfmon_bad_permissions(struct task_struct *task) { /* stolen from bad_signal() */ return (current->session != task->session) && (current->euid ^ task->suid) && (current->euid ^ task->uid) && (current->uid ^ task->suid) && (current->uid ^ task->uid); } asmlinkage int sys_perfmonctl (int pid, int cmd, int flags, perfmon_req_t *req, int count, long arg6, long arg7, long arg8, long stack) { struct pt_regs *regs = (struct pt_regs *) &stack; struct task_struct *child = current; int ret = -ESRCH; /* sanity check: * * ensures that we don't do bad things in case the OS * does not have enough storage to save/restore PMC/PMD */ if (PERFMON_IS_DISABLED()) return -ENOSYS; /* XXX: pid interface is going away in favor of pfm context */ if (pid != current->pid) { read_lock(&tasklist_lock); child = find_task_by_pid(pid); if (!child) goto abort_call; ret = -EPERM; if (perfmon_bad_permissions(child)) goto abort_call; /* * XXX: need to do more checking here */ if (child->state != TASK_ZOMBIE && child->state != TASK_STOPPED) { DBprintk((" warning process %d not in stable state %ld\n", pid, child->state)); } } ret = do_perfmonctl(child, cmd, flags, req, count, regs); abort_call: if (child != current) read_unlock(&tasklist_lock); return ret; } #if __GNUC__ >= 3 void asmlinkage pfm_block_on_overflow(void) #else void asmlinkage pfm_block_on_overflow(u64 arg0, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7) #endif { struct thread_struct *th = ¤t->thread; pfm_context_t *ctx = current->thread.pfm_context; int ret; /* * NO matter what notify_pid is, * we clear overflow, won't notify again */ th->pfm_must_block = 0; /* * do some sanity checks first */ if (!ctx) { printk("perfmon: process %d has no PFM context\n", current->pid); return; } if (ctx->ctx_notify_task == 0) { printk("perfmon: process %d has no task to notify\n", current->pid); return; } DBprintk((" current=%d task=%d\n", 
current->pid, ctx->ctx_notify_task->pid)); /* should not happen */ if (CTX_OVFL_NOBLOCK(ctx)) { printk("perfmon: process %d non-blocking ctx should not be here\n", current->pid); return; } DBprintk((" CPU%d %d before sleep\n", smp_processor_id(), current->pid)); /* * may go through without blocking on SMP systems * if restart has been received already by the time we call down() */ ret = down_interruptible(&ctx->ctx_restart_sem); DBprintk((" CPU%d %d after sleep ret=%d\n", smp_processor_id(), current->pid, ret)); /* * in case of interruption of down() we don't restart anything */ if (ret >= 0) { /* we reactivate on context switch */ ctx->ctx_fl_frozen = 0; /* * the ovfl_sem is cleared by the restart task and this is safe because we always * use the local reference */ pfm_reset_regs(ctx); /* * Unlock sampling buffer and reset index atomically * XXX: not really needed when blocking */ if (CTX_HAS_SMPL(ctx)) { ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0; ctx->ctx_smpl_buf->psb_index = 0; } DBprintk((" CPU%d %d unfreeze PMU\n", smp_processor_id(), current->pid)); ia64_set_pmc(0, 0); ia64_srlz_d(); /* state restored, can go back to work (user mode) */ } } /* * main overflow processing routine. * it can be called from the interrupt path or explicitely during the context switch code * Return: * new value of pmc[0]. if 0x0 then unfreeze, else keep frozen */ unsigned long update_counters (struct task_struct *task, u64 pmc0, struct pt_regs *regs) { unsigned long mask, i, cnum; struct thread_struct *th; pfm_context_t *ctx; unsigned long bv = 0; int my_cpu = smp_processor_id(); int ret = 1, buffer_is_full = 0; int ovfl_has_long_recovery, can_notify, need_reset_pmd16=0; struct siginfo si; /* * It is never safe to access the task for which the overflow interrupt is destinated * using the current variable as the interrupt may occur in the middle of a context switch * where current does not hold the task that is running yet. * * For monitoring, however, we do need to get access to the task which caused the overflow * to account for overflow on the counters. * * We accomplish this by maintaining a current owner of the PMU per CPU. During context * switch the ownership is changed in a way such that the reflected owner is always the * valid one, i.e. the one that caused the interrupt. */ if (task == NULL) { DBprintk((" owners[%d]=NULL\n", my_cpu)); return 0x1; } th = &task->thread; ctx = th->pfm_context; /* * XXX: debug test * Don't think this could happen given upfront tests */ if ((th->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) { printk("perfmon: Spurious overflow interrupt: process %d not using perfmon\n", task->pid); return 0x1; } if (!ctx) { printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", task->pid); return 0; } /* * sanity test. Should never happen */ if ((pmc0 & 0x1 )== 0) { printk("perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n", task->pid, pmc0); return 0x0; } mask = pmc0 >> PMU_FIRST_COUNTER; DBprintk(("pmc0=0x%lx pid=%d owner=%d iip=0x%lx, ctx is in %s mode used_pmds=0x%lx used_pmcs=0x%lx\n", pmc0, task->pid, PMU_OWNER()->pid, regs->cr_iip, CTX_OVFL_NOBLOCK(ctx) ? 
"NO-BLOCK" : "BLOCK", ctx->ctx_used_pmds[0], ctx->ctx_used_pmcs[0])); /* * XXX: need to record sample only when an EAR/BTB has overflowed */ if (CTX_HAS_SMPL(ctx)) { pfm_smpl_buffer_desc_t *psb = ctx->ctx_smpl_buf; unsigned long *e, m, idx=0; perfmon_smpl_entry_t *h; int j; idx = ia64_fetch_and_add(1, &psb->psb_index); DBprintk((" recording index=%ld entries=%ld\n", idx, psb->psb_entries)); /* * XXX: there is a small chance that we could run out on index before resetting * but index is unsigned long, so it will take some time..... * We use > instead of == because fetch_and_add() is off by one (see below) * * This case can happen in non-blocking mode or with multiple processes. * For non-blocking, we need to reload and continue. */ if (idx > psb->psb_entries) { buffer_is_full = 1; goto reload_pmds; } /* first entry is really entry 0, not 1 caused by fetch_and_add */ idx--; h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size)); h->pid = task->pid; h->cpu = my_cpu; h->rate = 0; h->ip = regs ? regs->cr_iip : 0x0; /* where did the fault happened */ h->regs = mask; /* which registers overflowed */ /* guaranteed to monotonically increase on each cpu */ h->stamp = perfmon_get_stamp(); e = (unsigned long *)(h+1); /* * selectively store PMDs in increasing index number */ for (j=0, m = ctx->ctx_smpl_regs; m; m >>=1, j++) { if (m & 0x1) { if (PMD_IS_COUNTER(j)) *e = ctx->ctx_pmds[j-PMU_FIRST_COUNTER].val + (ia64_get_pmd(j) & pmu_conf.perf_ovfl_val); else { *e = ia64_get_pmd(j); /* slow */ } DBprintk((" e=%p pmd%d =0x%lx\n", (void *)e, j, *e)); e++; } } /* * make the new entry visible to user, needs to be atomic */ ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count); DBprintk((" index=%ld entries=%ld hdr_count=%ld\n", idx, psb->psb_entries, psb->psb_hdr->hdr_count)); /* * sampling buffer full ? */ if (idx == (psb->psb_entries-1)) { /* * will cause notification, cannot be 0 */ bv = mask << PMU_FIRST_COUNTER; buffer_is_full = 1; DBprintk((" sampling buffer full must notify bv=0x%lx\n", bv)); /* * we do not reload here, when context is blocking */ if (!CTX_OVFL_NOBLOCK(ctx)) goto no_reload; /* * here, we have a full buffer but we are in non-blocking mode * so we need to reload overflowed PMDs with sampling reset values * and restart right away. */ } /* FALL THROUGH */ } reload_pmds: /* * in the case of a non-blocking context, we reload * with the ovfl_rval when no user notification is taking place (short recovery) * otherwise when the buffer is full which requires user interaction) then we use * smpl_rval which is the long_recovery path (disturbance introduce by user execution). * * XXX: implies that when buffer is full then there is always notification. */ ovfl_has_long_recovery = CTX_OVFL_NOBLOCK(ctx) && buffer_is_full; /* * XXX: CTX_HAS_SMPL() should really be something like CTX_HAS_SMPL() and is activated,i.e., * one of the PMC is configured for EAR/BTB. * * When sampling, we can only notify when the sampling buffer is full. */ can_notify = CTX_HAS_SMPL(ctx) == 0 && ctx->ctx_notify_task; DBprintk((" ovfl_has_long_recovery=%d can_notify=%d\n", ovfl_has_long_recovery, can_notify)); for (i = 0, cnum = PMU_FIRST_COUNTER; mask ; cnum++, i++, mask >>= 1) { if ((mask & 0x1) == 0) continue; DBprintk((" PMD[%ld] overflowed pmd=0x%lx pmod.val=0x%lx\n", cnum, ia64_get_pmd(cnum), ctx->ctx_pmds[i].val)); /* * Because we sometimes (EARS/BTB) reset to a specific value, we cannot simply use * val to count the number of times we overflowed. 
Otherwise we would loose the current value * in the PMD (which can be >0). So to make sure we don't loose * the residual counts we set val to contain full 64bits value of the counter. * * XXX: is this needed for EARS/BTB ? */ ctx->ctx_pmds[i].val += 1 + pmu_conf.perf_ovfl_val + (ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val); /* slow */ DBprintk((" pmod[%ld].val=0x%lx pmd=0x%lx\n", i, ctx->ctx_pmds[i].val, ia64_get_pmd(cnum)&pmu_conf.perf_ovfl_val)); if (can_notify && PMD_OVFL_NOTIFY(ctx, i)) { DBprintk((" CPU%d should notify task %p with signal %d\n", my_cpu, ctx->ctx_notify_task, ctx->ctx_notify_sig)); bv |= 1 << i; } else { DBprintk((" CPU%d PMD[%ld] overflow, no notification\n", my_cpu, cnum)); /* * In case no notification is requested, we reload the reset value right away * otherwise we wait until the notify_pid process has been called and has * has finished processing data. Check out pfm_overflow_notify() */ /* writes to upper part are ignored, so this is safe */ if (ovfl_has_long_recovery) { DBprintk((" CPU%d PMD[%ld] reload with smpl_val=%lx\n", my_cpu, cnum,ctx->ctx_pmds[i].smpl_rval)); ia64_set_pmd(cnum, ctx->ctx_pmds[i].smpl_rval); } else { DBprintk((" CPU%d PMD[%ld] reload with ovfl_val=%lx\n", my_cpu, cnum,ctx->ctx_pmds[i].smpl_rval)); ia64_set_pmd(cnum, ctx->ctx_pmds[i].ovfl_rval); } } if (cnum == ctx->ctx_btb_counter) need_reset_pmd16=1; } /* * In case of BTB overflow we need to reset the BTB index. */ if (need_reset_pmd16) { DBprintk(("reset PMD16\n")); ia64_set_pmd(16, 0); } no_reload: /* * some counters overflowed, but they did not require * user notification, so after having reloaded them above * we simply restart */ if (!bv) return 0x0; ctx->ctx_ovfl_regs = bv; /* keep track of what to reset when unblocking */ /* * Now we know that: * - we have some counters which overflowed (contains in bv) * - someone has asked to be notified on overflow. */ /* * If the notification task is still present, then notify_task is non * null. It is clean by that task if it ever exits before we do. */ if (ctx->ctx_notify_task) { si.si_errno = 0; si.si_addr = NULL; si.si_pid = task->pid; /* who is sending */ si.si_signo = ctx->ctx_notify_sig; /* is SIGPROF */ si.si_code = PROF_OVFL; /* goes to user */ si.si_pfm_ovfl = bv; /* * when the target of the signal is not ourself, we have to be more * careful. The notify_task may being cleared by the target task itself * in release_thread(). We must ensure mutual exclusion here such that * the signal is delivered (even to a dying task) safely. */ if (ctx->ctx_notify_task != current) { /* * grab the notification lock for this task */ spin_lock(&ctx->ctx_notify_lock); /* * now notify_task cannot be modified until we're done * if NULL, they it got modified while we were in the handler */ if (ctx->ctx_notify_task == NULL) { spin_unlock(&ctx->ctx_notify_lock); goto lost_notify; } /* * required by send_sig_info() to make sure the target * task does not disappear on us. */ read_lock(&tasklist_lock); } /* * in this case, we don't stop the task, we let it go on. It will * necessarily go to the signal handler (if any) when it goes back to * user mode. 
*/ DBprintk((" %d sending %d notification to %d\n", task->pid, si.si_signo, ctx->ctx_notify_task->pid)); /* * this call is safe in an interrupt handler, so does read_lock() on tasklist_lock */ ret = send_sig_info(ctx->ctx_notify_sig, &si, ctx->ctx_notify_task); if (ret != 0) printk(" send_sig_info(process %d, SIGPROF)=%d\n", ctx->ctx_notify_task->pid, ret); /* * now undo the protections in order */ if (ctx->ctx_notify_task != current) { read_unlock(&tasklist_lock); spin_unlock(&ctx->ctx_notify_lock); } /* * if we block set the pfm_must_block bit * when in block mode, we can effectively block only when the notified * task is not self, otherwise we would deadlock. * in this configuration, the notification is sent, the task will not * block on the way back to user mode, but the PMU will be kept frozen * until PFM_RESTART. * Note that here there is still a race condition with notify_task * possibly being nullified behind our back, but this is fine because * it can only be changed to NULL which by construction, can only be * done when notify_task != current. So if it was already different * before, changing it to NULL will still maintain this invariant. * Of course, when it is equal to current it cannot change at this point. */ if (!CTX_OVFL_NOBLOCK(ctx) && ctx->ctx_notify_task != current) { th->pfm_must_block = 1; /* will cause blocking */ } } else { lost_notify: DBprintk((" notification task has disappeared !\n")); /* * for a non-blocking context, we make sure we do not fall into the pfm_overflow_notify() * trap. Also in the case of a blocking context with lost notify process, then we do not * want to block either (even though it is interruptible). In this case, the PMU will be kept * frozen and the process will run to completion without monitoring enabled. * * Of course, we cannot loose notify process when self-monitoring. */ th->pfm_must_block = 0; } /* * if we block, we keep the PMU frozen. If non-blocking we restart. * in the case of non-blocking were the notify process is lost, we also * restart. */ if (!CTX_OVFL_NOBLOCK(ctx)) ctx->ctx_fl_frozen = 1; else ctx->ctx_fl_frozen = 0; DBprintk((" reload pmc0=0x%x must_block=%ld\n", ctx->ctx_fl_frozen ? 0x1 : 0x0, th->pfm_must_block)); return ctx->ctx_fl_frozen ? 0x1 : 0x0; } static void perfmon_interrupt (int irq, void *arg, struct pt_regs *regs) { u64 pmc0; struct task_struct *ta; pmc0 = ia64_get_pmc(0); /* slow */ /* * if we have some pending bits set * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1 */ if ((pmc0 & ~0x1) && (ta=PMU_OWNER())) { /* assumes, PMC[0].fr = 1 at this point */ pmc0 = update_counters(ta, pmc0, regs); /* * if pmu_frozen = 0 * pmc0 = 0 and we resume monitoring right away * else * pmc0 = 0x1 frozen but all pending bits are cleared */ ia64_set_pmc(0, pmc0); ia64_srlz_d(); } else { printk("perfmon: Spurious PMU overflow interrupt: pmc0=0x%lx owner=%p\n", pmc0, (void *)PMU_OWNER()); } } /* for debug only */ static int perfmon_proc_info(char *page) { char *p = page; u64 pmc0 = ia64_get_pmc(0); int i; p += sprintf(p, "CPU%d.pmc[0]=%lx\nPerfmon debug: %s\n", smp_processor_id(), pmc0, pfm_debug ? "On" : "Off"); p += sprintf(p, "proc_sessions=%lu sys_sessions=%lu\n", pfs_info.pfs_proc_sessions, pfs_info.pfs_sys_session); for(i=0; i < NR_CPUS; i++) { if (cpu_is_online(i)) { p += sprintf(p, "CPU%d.pmu_owner: %-6d\n", i, pmu_owners[i].owner ? 
pmu_owners[i].owner->pid: -1); } } return p - page; } /* for debug only */ static int perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = perfmon_proc_info(page); if (len <= off+count) *eof = 1; *start = page + off; len -= off; if (len>count) len = count; if (len<0) len = 0; return len; } static struct irqaction perfmon_irqaction = { handler: perfmon_interrupt, flags: SA_INTERRUPT, name: "perfmon" }; void __init perfmon_init (void) { pal_perf_mon_info_u_t pm_info; s64 status; register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); ia64_set_pmv(IA64_PERFMON_VECTOR); ia64_srlz_d(); pmu_conf.pfm_is_disabled = 1; printk("perfmon: version %s (sampling format v%d)\n", PFM_VERSION, PFM_SMPL_HDR_VERSION); printk("perfmon: Interrupt vectored to %u\n", IA64_PERFMON_VECTOR); if ((status=ia64_pal_perf_mon_info(pmu_conf.impl_regs, &pm_info)) != 0) { printk("perfmon: PAL call failed (%ld)\n", status); return; } pmu_conf.perf_ovfl_val = (1L << pm_info.pal_perf_mon_info_s.width) - 1; pmu_conf.max_counters = pm_info.pal_perf_mon_info_s.generic; pmu_conf.num_pmcs = find_num_pm_regs(pmu_conf.impl_regs); pmu_conf.num_pmds = find_num_pm_regs(&pmu_conf.impl_regs[4]); printk("perfmon: %d bits counters (max value 0x%lx)\n", pm_info.pal_perf_mon_info_s.width, pmu_conf.perf_ovfl_val); printk("perfmon: %ld PMC/PMD pairs, %ld PMCs, %ld PMDs\n", pmu_conf.max_counters, pmu_conf.num_pmcs, pmu_conf.num_pmds); /* sanity check */ if (pmu_conf.num_pmds >= IA64_NUM_PMD_REGS || pmu_conf.num_pmcs >= IA64_NUM_PMC_REGS) { printk(KERN_ERR "perfmon: ERROR not enough PMC/PMD storage in kernel, perfmon is DISABLED\n"); return; /* no need to continue anyway */ } /* we are all set */ pmu_conf.pfm_is_disabled = 0; /* * Insert the tasklet in the list. * It is still disabled at this point, so it won't run printk(__FUNCTION__" tasklet is %p state=%d, count=%d\n", &perfmon_tasklet, perfmon_tasklet.state, perfmon_tasklet.count); */ /* * for now here for debug purposes */ perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL); } void perfmon_init_percpu (void) { ia64_set_pmv(IA64_PERFMON_VECTOR); ia64_srlz_d(); } void pfm_save_regs (struct task_struct *ta) { struct task_struct *owner; pfm_context_t *ctx; struct thread_struct *t; u64 pmc0, psr; unsigned long mask; int i; t = &ta->thread; ctx = ta->thread.pfm_context; /* * We must make sure that we don't loose any potential overflow * interrupt while saving PMU context. In this code, external * interrupts are always enabled. */ /* * save current PSR: needed because we modify it */ __asm__ __volatile__ ("mov %0=psr;;": "=r"(psr) :: "memory"); /* * stop monitoring: * This is the only way to stop monitoring without destroying overflow * information in PMC[0]. * This is the last instruction which can cause overflow when monitoring * in kernel. * By now, we could still have an overflow interrupt in-flight. */ __asm__ __volatile__ ("rsm psr.up|psr.pp;;"::: "memory"); /* * Mark the PMU as not owned * This will cause the interrupt handler to do nothing in case an overflow * interrupt was in-flight * This also guarantees that pmc0 will contain the final state * It virtually gives us full control over overflow processing from that point * on. * It must be an atomic operation. */ owner = PMU_OWNER(); SET_PMU_OWNER(NULL); /* * read current overflow status: * * we are guaranteed to read the final stable state */ ia64_srlz_d(); pmc0 = ia64_get_pmc(0); /* slow */ /* * freeze PMU: * * This destroys the overflow information. 
This is required to make sure * next process does not start with monitoring on if not requested */ ia64_set_pmc(0, 1); /* * Check for overflow bits and proceed manually if needed * * It is safe to call the interrupt handler now because it does * not try to block the task right away. Instead it will set a * flag and let the task proceed. The blocking will only occur * next time the task exits from the kernel. */ if (pmc0 & ~0x1) { update_counters(owner, pmc0, NULL); /* we will save the updated version of pmc0 */ } /* * restore PSR for context switch to save */ __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(psr): "memory"); /* * we do not save registers if we can do lazy */ if (PFM_CAN_DO_LAZY()) { SET_PMU_OWNER(owner); return; } /* * XXX needs further optimization. * Also must take holes into account */ mask = ctx->ctx_used_pmds[0]; for (i=0; mask; i++, mask>>=1) { if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i); } /* skip PMC[0], we handle it separately */ mask = ctx->ctx_used_pmcs[0]>>1; for (i=1; mask; i++, mask>>=1) { if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i); } /* * Throughout this code we could have gotten an overflow interrupt. It is transformed * into a spurious interrupt as soon as we give up pmu ownership. */ } static void pfm_lazy_save_regs (struct task_struct *ta) { pfm_context_t *ctx; struct thread_struct *t; unsigned long mask; int i; DBprintk((" on [%d] by [%d]\n", ta->pid, current->pid)); t = &ta->thread; ctx = ta->thread.pfm_context; /* * XXX needs further optimization. * Also must take holes into account */ mask = ctx->ctx_used_pmds[0]; for (i=0; mask; i++, mask>>=1) { if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i); } /* skip PMC[0], we handle it separately */ mask = ctx->ctx_used_pmcs[0]>>1; for (i=1; mask; i++, mask>>=1) { if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i); } SET_PMU_OWNER(NULL); } void pfm_load_regs (struct task_struct *ta) { struct thread_struct *t = &ta->thread; pfm_context_t *ctx = ta->thread.pfm_context; struct task_struct *owner; unsigned long mask; int i; owner = PMU_OWNER(); if (owner == ta) goto skip_restore; if (owner) pfm_lazy_save_regs(owner); SET_PMU_OWNER(ta); mask = ctx->ctx_used_pmds[0]; for (i=0; mask; i++, mask>>=1) { if (mask & 0x1) ia64_set_pmd(i, t->pmd[i]); } /* skip PMC[0] to avoid side effects */ mask = ctx->ctx_used_pmcs[0]>>1; for (i=1; mask; i++, mask>>=1) { if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]); } skip_restore: /* * unfreeze only when possible */ if (ctx->ctx_fl_frozen == 0) { ia64_set_pmc(0, 0); ia64_srlz_d(); /* place where we potentially (kernel level) start monitoring again */ } } /* * This function is called when a thread exits (from exit_thread()). * This is a simplified pfm_save_regs() that simply flushes the current * register state into the save area taking into account any pending * overflow. This time no notification is sent because the taks is dying * anyway. The inline processing of overflows avoids loosing some counts. * The PMU is frozen on exit from this call and is to never be reenabled * again for this task. */ void pfm_flush_regs (struct task_struct *ta) { pfm_context_t *ctx; u64 pmc0, psr, mask; int i,j; if (ta == NULL) { panic(__FUNCTION__" task is NULL\n"); } ctx = ta->thread.pfm_context; if (ctx == NULL) { panic(__FUNCTION__" no PFM ctx is NULL\n"); } /* * We must make sure that we don't loose any potential overflow * interrupt while saving PMU context. In this code, external * interrupts are always enabled. 
*/ /* * save current PSR: needed because we modify it */ __asm__ __volatile__ ("mov %0=psr;;": "=r"(psr) :: "memory"); /* * stop monitoring: * This is the only way to stop monitoring without destroying overflow * information in PMC[0]. * This is the last instruction which can cause overflow when monitoring * in kernel. * By now, we could still have an overflow interrupt in-flight. */ __asm__ __volatile__ ("rsm psr.up;;"::: "memory"); /* * Mark the PMU as not owned * This will cause the interrupt handler to do nothing in case an overflow * interrupt was in-flight * This also guarantees that pmc0 will contain the final state * It virtually gives us full control on overflow processing from that point * on. * It must be an atomic operation. */ SET_PMU_OWNER(NULL); /* * read current overflow status: * * we are guaranteed to read the final stable state */ ia64_srlz_d(); pmc0 = ia64_get_pmc(0); /* slow */ /* * freeze PMU: * * This destroys the overflow information. This is required to make sure * next process does not start with monitoring on if not requested */ ia64_set_pmc(0, 1); ia64_srlz_d(); /* * restore PSR for context switch to save */ __asm__ __volatile__ ("mov psr.l=%0;;srlz.i;"::"r"(psr): "memory"); /* * This loop flushes the PMD into the PFM context. * IT also processes overflow inline. * * IMPORTANT: No notification is sent at this point as the process is dying. * The implicit notification will come from a SIGCHILD or a return from a * waitpid(). * * XXX: must take holes into account */ mask = pmc0 >> PMU_FIRST_COUNTER; for (i=0,j=PMU_FIRST_COUNTER; i< pmu_conf.max_counters; i++,j++) { /* collect latest results */ ctx->ctx_pmds[i].val += ia64_get_pmd(j) & pmu_conf.perf_ovfl_val; /* * now everything is in ctx_pmds[] and we need * to clear the saved context from save_regs() such that * pfm_read_pmds() gets the correct value */ ta->thread.pmd[j] = 0; /* take care of overflow inline */ if (mask & 0x1) { ctx->ctx_pmds[i].val += 1 + pmu_conf.perf_ovfl_val; DBprintk((" PMD[%d] overflowed pmd=0x%lx pmds.val=0x%lx\n", j, ia64_get_pmd(j), ctx->ctx_pmds[i].val)); } mask >>=1; } } /* * XXX: this routine is not very portable for PMCs * XXX: make this routine able to work with non current context */ static void ia64_reset_pmu(void) { int i; /* PMU is frozen, no pending overflow bits */ ia64_set_pmc(0,1); /* extra overflow bits + counter configs cleared */ for(i=1; i< PMU_FIRST_COUNTER + pmu_conf.max_counters ; i++) { ia64_set_pmc(i,0); } /* opcode matcher set to all 1s */ ia64_set_pmc(8,~0); ia64_set_pmc(9,~0); /* I-EAR config cleared, plm=0 */ ia64_set_pmc(10,0); /* D-EAR config cleared, PMC[11].pt must be 1 */ ia64_set_pmc(11,1 << 28); /* BTB config. 
/*
 * XXX: this routine is not very portable for PMCs
 * XXX: make this routine able to work with non current context
 */
static void
ia64_reset_pmu(void)
{
	int i;

	/* PMU is frozen, no pending overflow bits */
	ia64_set_pmc(0,1);

	/* extra overflow bits + counter configs cleared */
	for(i=1; i < PMU_FIRST_COUNTER + pmu_conf.max_counters; i++) {
		ia64_set_pmc(i,0);
	}
	/* opcode matchers set to all 1s */
	ia64_set_pmc(8,~0);
	ia64_set_pmc(9,~0);

	/* I-EAR config cleared, plm=0 */
	ia64_set_pmc(10,0);

	/* D-EAR config cleared, PMC[11].pt must be 1 */
	ia64_set_pmc(11,1 << 28);

	/* BTB config cleared, plm=0 */
	ia64_set_pmc(12,0);

	/* Instruction address range, PMC[13].ta must be 1 */
	ia64_set_pmc(13,1);

	/* clear all PMD registers */
	for(i=0; i < pmu_conf.num_pmds; i++) {
		if (PMD_IS_IMPL(i)) ia64_set_pmd(i,0);
	}
	ia64_srlz_d();
}

/*
 * task is the newly created task
 */
int
pfm_inherit(struct task_struct *task, struct pt_regs *regs)
{
	pfm_context_t *ctx = current->thread.pfm_context;
	pfm_context_t *nctx;
	struct thread_struct *th = &task->thread;
	int i, cnum;

	/*
	 * bypass completely for system wide
	 */
	if (pfs_info.pfs_sys_session) {
		DBprintk((" enabling psr.pp for %d\n", task->pid));
		ia64_psr(regs)->pp = pfs_info.pfs_pp;
		return 0;
	}

	/*
	 * take care of the easiest case first
	 */
	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
		DBprintk((" removing PFM context for %d\n", task->pid));
		task->thread.pfm_context = NULL;
		task->thread.pfm_must_block = 0;
		atomic_set(&task->thread.pfm_notifiers_check, 0);
		/* copy_thread() clears IA64_THREAD_PM_VALID */
		return 0;
	}
	nctx = pfm_context_alloc();
	if (nctx == NULL) return -ENOMEM;

	/* copy content */
	*nctx = *ctx;

	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
		nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
		atomic_set(&task->thread.pfm_notifiers_check, 0);
		DBprintk((" downgrading to INHERIT_NONE for %d\n", task->pid));
		pfs_info.pfs_proc_sessions++;
	}

	/* initialize counters in new context */
	for(i=0, cnum = PMU_FIRST_COUNTER; i < pmu_conf.max_counters; cnum++, i++) {
		nctx->ctx_pmds[i].val = nctx->ctx_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
		th->pmd[cnum]         = nctx->ctx_pmds[i].ival & pmu_conf.perf_ovfl_val;
	}
	/* clear BTB index register */
	th->pmd[16] = 0;

	/* if sampling then increment number of users of buffer */
	if (nctx->ctx_smpl_buf) {
		atomic_inc(&nctx->ctx_smpl_buf->psb_refcnt);
	}
	nctx->ctx_fl_frozen = 0;
	nctx->ctx_ovfl_regs = 0;
	sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */

	/* clear pending notification */
	th->pfm_must_block = 0;

	/* link with new task */
	th->pfm_context = nctx;

	DBprintk((" nctx=%p for process %d\n", (void *)nctx, task->pid));

	/*
	 * copy_thread() automatically clears IA64_THREAD_PM_VALID,
	 * so we need to re-enable it, if it was used by the caller
	 */
	if (current->thread.flags & IA64_THREAD_PM_VALID) {
		DBprintk((" setting PM_VALID for %d\n", task->pid));
		th->flags |= IA64_THREAD_PM_VALID;
	}
	return 0;
}
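/*
 * Note on the counter initialization in pfm_inherit(): the child restarts
 * every counter from ctx_pmds[].ival, but the 64-bit value is split in two.
 * Only the implemented low bits (perf_ovfl_val mask) are seeded into
 * thread.pmd[] and will end up in the hardware PMD, while the upper bits
 * are kept in ctx_pmds[].val. The read and flush paths recombine the two
 * halves, so counters appear as full 64-bit values to the user even though
 * the hardware registers are narrower.
 */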
/*
 * called from release_thread(), at this point this task is not in the
 * tasklist anymore
 */
void
pfm_context_exit(struct task_struct *task)
{
	pfm_context_t *ctx = task->thread.pfm_context;

	if (!ctx) {
		DBprintk((" invalid context for %d\n", task->pid));
		return;
	}

	/* check if we have a sampling buffer attached */
	if (ctx->ctx_smpl_buf) {
		pfm_smpl_buffer_desc_t *psb = ctx->ctx_smpl_buf;

		/* if only user left, then remove */
		DBprintk((" [%d] [%d] psb->refcnt=%d\n", current->pid, task->pid, psb->psb_refcnt.counter));

		if (atomic_dec_and_test(&psb->psb_refcnt)) {
			rvfree(psb->psb_hdr, psb->psb_size);
			vfree(psb);
			DBprintk((" [%d] cleaning [%d] sampling buffer\n", current->pid, task->pid));
		}
	}
	DBprintk((" [%d] cleaning [%d] pfm_context @%p\n", current->pid, task->pid, (void *)ctx));

	/*
	 * To avoid having the notified task scan the entire process list
	 * when it exits because it would have pfm_notifiers_check set, we
	 * decrease it by 1 to inform the task that one less task is going
	 * to send it notifications. Each new notifier increases this field by
	 * 1 in pfm_context_create(). Of course, there is a race condition between
	 * decreasing the value and the notified task exiting. The danger comes
	 * from the fact that we have a direct pointer to its task structure,
	 * thereby bypassing the tasklist. We must make sure that if we have
	 * notify_task != NULL, the target task is still somewhat present. It may
	 * already be detached from the tasklist but that's okay. Note that it is
	 * okay if we 'miss the deadline' and the task scans the list for nothing,
	 * it will affect performance but not correctness. The correctness is ensured
	 * by using the notify_lock which prevents the notify_task from changing on us.
	 * Once holding this lock, if we see notify_task != NULL, then it will stay like
	 * that until we release the lock. If it is NULL already then we came too late.
	 */
	spin_lock(&ctx->ctx_notify_lock);

	if (ctx->ctx_notify_task) {
		DBprintk((" [%d] [%d] atomic_sub on [%d] notifiers=%u\n", current->pid, task->pid,
			  ctx->ctx_notify_task->pid,
			  atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));

		atomic_sub(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check);
	}

	spin_unlock(&ctx->ctx_notify_lock);

	if (ctx->ctx_fl_system) {
		/*
		 * if interrupts were included (true by default), then reset
		 * the DCR to its default value
		 */
		if (ctx->ctx_fl_exclintr == 0) {
			/*
			 * reload kernel default DCR value
			 */
			ia64_set_dcr(pfs_info.pfs_dfl_dcr);
			DBprintk((" restored dcr to 0x%lx\n", pfs_info.pfs_dfl_dcr));
		}
		/*
		 * free system wide session slot
		 */
		pfs_info.pfs_sys_session = 0;
	} else {
		pfs_info.pfs_proc_sessions--;
	}

	pfm_context_free(ctx);

	/*
	 * clean pfm state in thread structure
	 */
	task->thread.pfm_context = NULL;
	task->thread.pfm_must_block = 0;
	/* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */
}

void
pfm_cleanup_notifiers(struct task_struct *task)
{
	struct task_struct *p;
	pfm_context_t *ctx;

	DBprintk((" [%d] called\n", task->pid));

	read_lock(&tasklist_lock);

	for_each_task(p) {
		/*
		 * It is safe to do the 2-step test here, because thread.ctx
		 * is cleaned up only in release_thread() and at that point
		 * the task has been detached from the tasklist, which is an
		 * operation which uses the write_lock() on the tasklist_lock,
		 * so it cannot run concurrently to this loop. So we have the
		 * guarantee that if we find p and it has a perfmon ctx then
		 * it is going to stay like this for the entire execution of this
		 * loop.
		 */
		ctx = p->thread.pfm_context;

		DBprintk((" [%d] scanning task [%d] ctx=%p\n", task->pid, p->pid, ctx));

		if (ctx && ctx->ctx_notify_task == task) {
			DBprintk((" trying for notifier %d in %d\n", task->pid, p->pid));
			/*
			 * the spinlock is required to take care of a race condition
			 * with the send_sig_info() call. We must make sure that
			 * either the send_sig_info() completes using a valid task,
			 * or the notify_task is cleared before the send_sig_info()
			 * can pick up a stale value. Note that by the time this
			 * function is executed the 'task' is already detached from the
			 * tasklist. The problem is that the notifiers have a direct
			 * pointer to it. It is okay to send a signal to a task in this
			 * state, it simply will have no effect. But it is better than sending
			 * to a completely destroyed task or, worse, to a new task using the same
			 * task_struct address.
			 */
			spin_lock(&ctx->ctx_notify_lock);

			ctx->ctx_notify_task = NULL;

			spin_unlock(&ctx->ctx_notify_lock);

			DBprintk((" done for notifier %d in %d\n", task->pid, p->pid));
		}
	}
	read_unlock(&tasklist_lock);
}

#else /* !CONFIG_PERFMON */

asmlinkage int
sys_perfmonctl (int pid, int cmd, int flags, perfmon_req_t *req, int count, long arg6,
		long arg7, long arg8, long stack)
{
	return -ENOSYS;
}

#endif /* !CONFIG_PERFMON */
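/*
 * For reference, one plausible user-level calling sequence for the interface
 * implemented above (schematic only; the layout of perfmon_req_t and the
 * exact argument conventions are defined in the ia64 perfmon headers and are
 * not reproduced here):
 *
 *   sys_perfmonctl(pid, PFM_CREATE_CONTEXT, ...);
 *   sys_perfmonctl(pid, PFM_WRITE_PMCS, ...);
 *   sys_perfmonctl(pid, PFM_WRITE_PMDS, ...);
 *   sys_perfmonctl(pid, PFM_START, ...);
 *       ... monitored execution ...
 *   sys_perfmonctl(pid, PFM_STOP, ...);
 *   sys_perfmonctl(pid, PFM_READ_PMDS, ...);
 *   sys_perfmonctl(pid, PFM_DESTROY_CONTEXT, ...);
 */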