diff --git a/kernel/behave.c b/kernel/behave.c index ffd18d9f7ee71efa5657cdc9805e904d89d1fd25..2d86ea3f22dc2eb9bbd68ca97b3dba35f282377f 100644 --- a/kernel/behave.c +++ b/kernel/behave.c @@ -12,9 +12,6 @@ int is_initialized = 0; * Based on "libpbm" (see header file for more info) */ -#define TASK_BUF_SIZE 4096 -#define PROC_BUF_SIZE 512 - /* ----------------------------------------------------------------------------- * PID -> PBM* Hashmap * Only insertion is needed currently (since memory efficiency is not our @@ -33,6 +30,15 @@ typedef struct { shared_pbm_int_map _index_2_pbm; shared_pbm_int_map* index_2_pbm = &_index_2_pbm; +/** + * Why not use a generic map data structure using void*, you ask? + * + * Because we do not only need _access_ to the data (PBM) but also need _memory-management_ for it. + * We can not use the generic kernel memory management (e.g. kalloc) since that might need certain + * tasks to be scheduled which in turn could lead to deadlocks. + * Therefore memory management/storage and access is implemented by this map in a simple and + * effective (but certainly inefficient) way. 
+ */ static void init_pbm_int_map(void) { int i; @@ -255,9 +261,8 @@ void pbm_uninit(void) { /* Insert a task node into the PBM of given type (only COMP for now) */ int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc) { pbm_NODE* node; - - unsigned long irq_flags; struct perf_event *pevent; + int error; // only continue for COMP-nodes since other types are not implemented yet if(!(pbm && type == COMP)) { @@ -280,63 +285,19 @@ int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc) { // finally the new node becomes the last inserted one pbm->last = node; - /* - * Configure the performance counter - */ - memset(&(pbm->pea), 0, sizeof(struct perf_event_attr)); - pbm->pea.type = PERF_TYPE_HARDWARE; - pbm->pea.size = sizeof(struct perf_event_attr); - pbm->pea.config = PERF_COUNT_HW_INSTRUCTIONS; - pbm->pea.sample_period = 400800; - pbm->pea.disabled = 0; // start the counter as soon as we're in userland - pbm->pea.pinned = 1; - pbm->pea.exclude_kernel = 1; - pbm->pea.exclude_hv = 1; - pbm->pea.wakeup_events = 1; // Not needed on 3.2? - - printk(KERN_WARNING "TASK: %u, CPU: %i, PTR: %llu\n", proc->pid, smp_processor_id(), (u64)proc); - - /* - * Try to enable the performance counter - */ - // disable irqs to make 'perf_event_ctx_activate' in 'kernel/events/core.c' happy - local_irq_save(irq_flags); - pevent = perf_event_create_kernel_counter( - &(pbm->pea), - -1, // measure on all cores (in case the process runs on different ones) - proc, // exclusively measure the forked process (BEWARE: a process can only measure itself!) - NULL, //&overflow_handler, - NULL - ); - local_irq_restore(irq_flags); - if (IS_ERR(pevent)) { - printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", proc->pid, PTR_ERR(pevent)); - // cast to prevent compiler warnings - if (-EOPNOTSUPP == (int64_t)pevent) { - printk(KERN_WARNING - "TASK: %u | EOPNOTSUPP (-95): The hardware does not support certain attributes! " - "E.g. 
perf_event_attr.precise_ip > 0 may not be supported.\n", proc->pid); - } - if (-EINVAL == (int64_t)pevent) { - printk(KERN_WARNING - "TASK: %u | EINVAL (-22): Invalid argument!" - "E.g. CPU with given index does not exist.\n", proc->pid); - } - return 0; - } - if (pevent->state != PERF_EVENT_STATE_ACTIVE) { - printk(KERN_WARNING "TASK: %u | Event is inactive", proc->pid); - } - pbm->pevent = pevent; + error = init_perf_event(proc, 400800, &pevent); + if (error) { + printk(KERN_WARNING "TASK: %u | Counting NOT started due to error\n", proc->pid); + return -1; + } + pbm->pevent = pevent; printk(KERN_WARNING "TASK: %u | Counting started...\n", proc->pid); - - return 2; + return 0; } /* Conclude the last task of the given PBM */ int pbm_task_end(PBM* pbm) { - unsigned long irq_flags; int read_error; struct perf_event *pevent; u64 perf_counter; @@ -361,26 +322,14 @@ int pbm_task_end(PBM* pbm) { printk(KERN_WARNING "TASK: %u | PEVENT INVALID\n", current->pid); return 0; } - read_error = perf_event_read_local(pevent, &perf_counter); + read_error = get_perf_counter(pevent, &perf_counter); if (read_error) { - printk(KERN_WARNING "TASK: %u | FETCHING PERFORMANCE COUNTER IN stop_counting FAILED WITH %i\n", current->pid, read_error); - if (-EINVAL == (int64_t)read_error) { - // If this is a per-task event, it must be for current. - // If this is a per-CPU event, it must be for this CPU. - printk(KERN_WARNING - "TASK: %u | EINVAL (-22): Invalid argument! " - "E.g. 
trying to measure a different task than itself.\n", current->pid); - } - } else { - pbm->last->count = perf_counter; + printk(KERN_WARNING "TASK: %u | ...something went wrong while stopping counting\n", current->pid); + return -1; } + pbm->last->count = perf_counter; - // disable performance counter while preventing context switching - local_irq_save(irq_flags); - perf_event_disable(pevent); - perf_event_release_kernel(pevent); - local_irq_restore(irq_flags); - pevent = NULL; + terminate_perf_event(pevent); printk(KERN_WARNING "TASK: %u | ...Counting stopped: %llu instr.\n", current->pid, perf_counter); @@ -492,9 +441,6 @@ int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_node) { return 0; } - // general configurations for perf_event interface - child_pbm->pea.size = sizeof(struct perf_event_attr); - child_pbm->root = NULL; child_pbm->last = NULL; child_pbm->children = NULL; @@ -734,19 +680,6 @@ void _pbm_unvisit_node(pbm_NODE* node) { /******************************************************************************/ -/* - * handle the perf overflow event -> task needed more instructions than planed - */ -/* -static void overflow_handler( - struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - printk(KERN_WARNING "PB TASK RAN TOO LONG\n"); -} -*/ - int is_root_process(struct task_struct* p) { return strcmp(p->comm, "mpirun") == 0; } diff --git a/kernel/behave.h b/kernel/behave.h index e55a10a86fe4a35c55072066aec3b541a93ba2db..58b243424edadfa0d0033c29e1005f1d53850ca6 100644 --- a/kernel/behave.h +++ b/kernel/behave.h @@ -1,7 +1,7 @@ #ifndef PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H #define PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H -#include <linux/perf_event.h> +#include "sched/perf_error_detection.h" /****************************************************************************** * Based on "libpbm": @@ -73,7 +73,6 @@ typedef struct _PBM /* * Performance measurement and recording */ - struct perf_event_attr pea; // config 
info for the perf_event interface struct perf_event* pevent; } PBM; diff --git a/kernel/sched/pb.c b/kernel/sched/pb.c index be76fdda1d83d930428163a5f1b2fad2452c81c0..06059907d6075a308f8e3e6bec5ed3ea9e09ca54 100644 --- a/kernel/sched/pb.c +++ b/kernel/sched/pb.c @@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <linux/perf_event.h> #include <linux/kthread.h> +#include "sched.h" typedef struct pb_plan pb_plan_t; @@ -93,7 +94,7 @@ int pb_submit_plan(struct rq *rq) return -1; } - perf_init_res = init_perf_event(&pb->plan[i], &pb->pevent); + perf_init_res = init_perf_event(pb->plan[i].task_struct, pb->plan[i].n_instr, &(pb->pevent)); if(perf_init_res < 0) { //initialization error detection/handling could happen here printk(KERN_WARNING "PB INIT,%u: FAILED OPEN PERF EVENT\n", i); diff --git a/kernel/sched/perf_error_detection.c b/kernel/sched/perf_error_detection.c index 7e3024141232c126e84eae2825ba4a3026f13231..96d3299846d9593a4c2c26db81e2f6df5a6d2508 100644 --- a/kernel/sched/perf_error_detection.c +++ b/kernel/sched/perf_error_detection.c @@ -12,27 +12,63 @@ /* * initialize perf event for new task */ -int init_perf_event(struct plan_entry *plan_entry, struct perf_event **pevent){ +int init_perf_event(struct task_struct* proc, u64 num_instr, + struct perf_event** pevent) +{ unsigned long irq_flags; - struct perf_event_attr pe; - - memset(&pe, 0, sizeof(struct perf_event_attr)); - pe.type = PERF_TYPE_HARDWARE; - pe.size = sizeof(struct perf_event_attr); - pe.config = PERF_COUNT_HW_INSTRUCTIONS; - pe.sample_period = plan_entry->n_instr; - pe.disabled = 0; // start the counter as soon as we're in userland - pe.exclude_kernel = 1; // only count user space - pe.exclude_hv = 1; // excluding events that happen in the hypervisor - + struct perf_event_attr pea; // config info for the perf_event interface + + /* + * Configure the performance counter + */ + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_HARDWARE; + pea.size = sizeof(struct 
perf_event_attr); + pea.config = PERF_COUNT_HW_INSTRUCTIONS; + pea.sample_period = num_instr; + pea.disabled = 0; // start the counter as soon as we're in userland + pea.exclude_kernel = 1; // only count user space + pea.exclude_hv = 1; // excluding events that happen in the hypervisor + + printk(KERN_WARNING "TASK: %u, CPU: %i, PTR: %llu\n", proc->pid, smp_processor_id(), (u64)proc); + + /* + * Try to enable the performance counter + */ // disable irqs to make 'perf_event_ctx_activate' in 'kernel/events/core.c' happy local_irq_save(irq_flags); - *pevent = perf_event_create(&pe, 0, plan_entry->task_struct); + *pevent = perf_event_create_kernel_counter( + &pea, + -1, // measure on all cores (in case the process runs on different ones) + proc, // exclusively measure the forked process (BEWARE: a process can only measure itself!) + NULL, // overflow_handler disabled, because we count within the scheduler + NULL + ); local_irq_restore(irq_flags); - if (IS_ERR(pevent)) { - printk(KERN_WARNING "PB ERROR INITIALISING PERF EVENT\n"); - return -1; + if (IS_ERR(*pevent)) { + printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", proc->pid, PTR_ERR(*pevent)); + // cast to prevent compiler warnings + if (-EOPNOTSUPP == (int64_t)*pevent) { + printk(KERN_WARNING + "TASK: %u | EOPNOTSUPP (-95): The hardware does not support certain attributes! " + "E.g. perf_event_attr.precise_ip > 0 may not be supported.\n", proc->pid); + } + if (-EINVAL == (int64_t)*pevent) { + printk(KERN_WARNING + "TASK: %u | EINVAL (-22): Invalid argument! " + "E.g. CPU with given index does not exist.\n", proc->pid); + } + + if (-ESRCH == (int64_t)*pevent) { + printk(KERN_WARNING + "TASK: %u | ESRCH (-3): No such process! " + "E.g. 
the process to measure already exited.\n", proc->pid); + } + return -1; + } + if ((*pevent)->state != PERF_EVENT_STATE_ACTIVE) { + printk(KERN_WARNING "TASK: %u | Event is inactive", proc->pid); } return 0; @@ -44,7 +80,18 @@ int init_perf_event(struct plan_entry *plan_entry, struct perf_event **pevent){ */ u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter) { - return perf_event_read_local(pevent, perf_counter); + int read_error = perf_event_read_local(pevent, perf_counter); + if (read_error) { + printk(KERN_WARNING "TASK: %u | FETCHING PERFORMANCE COUNTER IN stop_counting FAILED WITH %i\n", current->pid, read_error); + if (-EINVAL == (int64_t)read_error) { + // If this is a per-task event, it must be for current. + // If this is a per-CPU event, it must be for this CPU. + printk(KERN_WARNING + "TASK: %u | EINVAL (-22): Invalid argument! " + "E.g. trying to measure a different task than itself.\n", current->pid); + } + } + return read_error; } u64 terminate_perf_event(struct perf_event *pevent) @@ -52,40 +99,12 @@ u64 terminate_perf_event(struct perf_event *pevent) u64 result; unsigned long irq_flags; + // disable performance counter while preventing context switching local_irq_save(irq_flags); + perf_event_disable(pevent); result = perf_event_release_kernel(pevent); local_irq_restore(irq_flags); + pevent = NULL; - return result; -} - - -// /* -// * handle the perf overflow event -> task needed more instructions than planed -// */ -// void overflow_handler( -// struct perf_event *event, -// struct perf_sample_data *data, -// struct pt_regs *regs) -// { -// struct pb_rq *pb_rq; -// int cpu; - -// cpu = smp_processor_id(); -// pb_rq = &cpu_rq(cpu)->pb; - -// if(pb_rq->is_initialized) -// printk(KERN_WARNING "OH: PB TASK %llu RAN TOO LONG\n",pb_rq->plan[pb_rq->c_entry].task_id); -// else -// printk(KERN_WARNING "OH: PB TASK RAN TOO LONG\n"); -// } - -struct perf_event* perf_event_create(struct perf_event_attr *hw_event_uptr, int cpu, struct task_struct 
*task_struct) -{ - return perf_event_create_kernel_counter( - hw_event_uptr, - cpu, - task_struct, - NULL, /* &overflow_handler disabled, because we count within the scheduler */ - NULL); + return result; } diff --git a/kernel/sched/perf_error_detection.h b/kernel/sched/perf_error_detection.h index 44b6e385866ef1719904dae5eba1600a58fa5675..09024038328dcd2d6df74b5fcb1d9949b28cefd0 100644 --- a/kernel/sched/perf_error_detection.h +++ b/kernel/sched/perf_error_detection.h @@ -2,19 +2,24 @@ #define __PERF_ERROR_DETECTION_H #include <linux/perf_event.h> -#include "sched.h" -int init_perf_event(struct plan_entry*, struct perf_event**); +#define TASK_BUF_SIZE 4096 +#define PROC_BUF_SIZE 512 + +int init_perf_event(struct task_struct* proc, u64 num_instr, struct perf_event **pevent); u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter); u64 terminate_perf_event(struct perf_event *pevent); -void overflow_handler( - struct perf_event *, - struct perf_sample_data *, - struct pt_regs *regs); - -struct perf_event *perf_event_create(struct perf_event_attr *hw_event_uptr, int cpu, struct task_struct *task_struct); +typedef struct { + // primitive int -> int and int -> pevent* hashmap combination + // the two arrays store the pid and corresponding PBM* at the same index + pid_t index_to_pid[PROC_BUF_SIZE]; + struct perf_event* index_to_pevent[PROC_BUF_SIZE]; + // index of currently last process in the arrays + size_t last_proc_index; + //pthread_mutex_t lock; +} pid_2_pevent_map; #endif