diff --git a/kernel/behave.c b/kernel/behave.c index c8fa9d32eb2d80315572c0c4be0a20a73e078fb7..f90b5a503ffbcccdc34faf8c12db6fc4bda73972 100644 --- a/kernel/behave.c +++ b/kernel/behave.c @@ -8,153 +8,322 @@ int is_initialized = 0; -// ============================================================================= +/****************************************************************************** +* Based on "libpbm" (see header file for more info) +*/ + +#define TASK_BUF_SIZE 4096 #define PROC_BUF_SIZE 512 -/* - * PID -> perf_event* Hashmap - * Only insertion is supported currently (since memory efficiency is not our +/* ----------------------------------------------------------------------------- + * PID -> PBM* Hashmap + * Only insertion is needed currently (since memory efficiency is not our * current concern) */ typedef struct { - // primitive int -> pid_t and int -> perf_event* "hashmap" (not really) combination implemented - // using two arrays to store the pid and corresponding perf_event* at the same index + // primitive int -> int and int -> PBM* hashmap combination + // the two arrays store the pid and corresponding PBM* at the same index pid_t index_to_pid[PROC_BUF_SIZE]; - struct perf_event* index_to_pevent[PROC_BUF_SIZE]; - size_t last_proc_index; // index of currently last process in the arrays -} task_pevent_map; + PBM* index_to_pbm[PROC_BUF_SIZE]; + // index of currently last process in the arrays + size_t last_proc_index; + //pthread_mutex_t lock; +} shared_pbm_int_map; + +shared_pbm_int_map _index_2_pbm; +shared_pbm_int_map* index_2_pbm = &_index_2_pbm; -task_pevent_map task_2_pevent; -void init_task_pevent_map(void) { - int i; +static void init_pbm_int_map(void) { + int i; + //index_2_pbm = init_shared_memory(index_2_pbm, sizeof(shared_pbm_int_map)); + for (i = 0; i < PROC_BUF_SIZE; ++i) { - task_2_pevent.index_to_pid[i] = 0; - task_2_pevent.index_to_pevent[i] = NULL; + index_2_pbm->index_to_pid[i] = 0; + index_2_pbm->index_to_pbm[i] = NULL; } - task_2_pevent.last_proc_index = 0; - printk(KERN_WARNING "Task map initialized by TASK: %u\n", current->pid); + index_2_pbm->last_proc_index = 0; + //init_shared_lock(&index_2_pbm->lock); +} + +static void uninit_pbm_int_map(void) { + //munmap(index_2_pbm, sizeof(index_2_pbm)); } /* - * Returns the perf_event* of the corresponding given process by PID if it + * Returns the PBM-pointer of the corresponding given process by PID if it * exists otherwise it returns NULL (assumption: process with PID=0 is not * considered a normal process) * * This is a primitive "Hashmap"-retrieval implementation (O(n)) */ -struct perf_event* get_pevent_by_pid(pid_t pid) { - size_t proc_index; - size_t i; +PBM* get_pbm_by_pid(pid_t pid) { + size_t i; // to get the PBM by PID one needs to find the index that corresponds to // the given PID and use that to retrieve the PBM of the second array // directly by index - proc_index = 0; - for(i = 0; i <= task_2_pevent.last_proc_index; i++) { - if (task_2_pevent.index_to_pid[i] == pid) { + size_t proc_index = 0; + for(i = 0; i < PROC_BUF_SIZE; i++) { + if (index_2_pbm->index_to_pid[i] == pid) { proc_index = i; break; } } - return proc_index != 0 ? task_2_pevent.index_to_pevent[proc_index] : NULL; + return proc_index != 0 ? index_2_pbm->index_to_pbm[proc_index] : NULL; } /* - * Adds a process (pid, perf_event) to the pid->perf_event hashmap (NON-idempotently!) + * Adds a process (pid, pbm) to the pid->pbm hashmap (NON-idempotently!) * * Returns if the process has been successfully inserted into the hashmap */ -int add_proc_to_map(pid_t pid, struct perf_event* pevent) { - if (PROC_BUF_SIZE <= task_2_pevent.last_proc_index) { - printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", task_2_pevent.last_proc_index); +int add_proc_to_map(pid_t pid, PBM* pbm) { + //pthread_mutex_lock(&index_2_pbm->lock); + if (TASK_BUF_SIZE <= index_2_pbm->last_proc_index) { + printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", index_2_pbm->last_proc_index); return 0; } - printk(KERN_WARNING "i: %lu, pid: %u\n", task_2_pevent.last_proc_index, pid); - task_2_pevent.last_proc_index++; - task_2_pevent.index_to_pid[task_2_pevent.last_proc_index] = pid; - task_2_pevent.index_to_pevent[task_2_pevent.last_proc_index] = pevent; + printk(KERN_WARNING "i: %lu, pid: %u\n", index_2_pbm->last_proc_index, pid); + index_2_pbm->last_proc_index++; + index_2_pbm->index_to_pid[index_2_pbm->last_proc_index] = pid; + index_2_pbm->index_to_pbm[index_2_pbm->last_proc_index] = pbm; + //pthread_mutex_unlock(&index_2_pbm->lock); return 1; } -// ============================================================================= -int start_counting(struct task_struct *p) { +static void debug_print_map(void) { + size_t i; + // lock to make the map printing sequential without interleaving other + // outputs + //pthread_mutex_lock(&index_2_pbm->lock); + printk(KERN_WARNING "MAP\n-----\n"); + for(i = 1; i < PROC_BUF_SIZE; i++) { + if (0 == index_2_pbm->index_to_pid[i]) + break; + printk(KERN_WARNING " %u\n", index_2_pbm->index_to_pid[i]); + } + printk(KERN_WARNING "-----\n"); + //pthread_mutex_unlock(&index_2_pbm->lock); +} + +/* ----------------------------------------------------------------------------- + * Task buffer which holds the nodes of the task graph + */ +typedef struct { + // buffer that holds the nodes of the task graph + pbm_NODE task_buffer[TASK_BUF_SIZE]; + // index of current task in task_buffer[] + uint32_t curr_task_index; + //pthread_mutex_t lock; +} task_buf; + +task_buf _tasks; +task_buf* tasks = &_tasks; + +void init_task_buf(void) { + //tasks = init_shared_memory(tasks, sizeof(task_buf)); + memset(tasks, 0, sizeof(task_buf)); + tasks->curr_task_index = 0; + //init_shared_lock(&tasks->lock); +} + +void uninit_task_buf(void) { + //munmap(tasks, sizeof(tasks)); +} + +pbm_NODE* task_alloc(void) { + pbm_NODE* new_task_node; + //pthread_mutex_lock(&tasks->lock); + if (TASK_BUF_SIZE <= tasks->curr_task_index) { + printk(KERN_WARNING "ERROR: Tried to alloc more tasks than available!\n"); + return NULL; + } + // get the memory address of the next free task node space + new_task_node = &(tasks->task_buffer[tasks->curr_task_index]); + tasks->curr_task_index++; + //pthread_mutex_unlock(&tasks->lock); + return new_task_node; +} + +void debug_print_tasks(void) { + size_t i; + pbm_NODE t; + printk(KERN_WARNING "-----\nTASKS:\n"); + for(i = 0; i <= tasks->curr_task_index; i++) { + t = tasks->task_buffer[i]; + printk(KERN_WARNING "type: %u, thread_id: %u, count: %llu, children: %llx, next_sib: %llx\n", t.type, t.thread_id, t.count, (uint64_t)t.children, (uint64_t)t.next_sib); + } + printk(KERN_WARNING "^^^^^\n"); +} + + /* ----------------------------------------------------------------------------- + * PBM buffer which holds the nodes of the process graph + */ +typedef struct { + // buffer that holds the nodes of the process graph + PBM process_buffer[PROC_BUF_SIZE]; + // index of current process in proc_buffer[] + uint32_t curr_proc_index; + //pthread_mutex_t lock; +} proc_buf; + +proc_buf _procs; +proc_buf* procs = &_procs; + +static void init_proc_buf(void) { + //procs = init_shared_memory(procs, sizeof(proc_buf)); + memset(procs, 0, sizeof(proc_buf)); + procs->curr_proc_index = 0; + //init_shared_lock(&procs->lock); +} + +static void uninit_proc_buf(void) { + //munmap(procs, sizeof(procs)); +} + +PBM* proc_alloc(void) { + PBM* new_pbm; + //pthread_mutex_lock(&procs->lock); + if (PROC_BUF_SIZE <= procs->curr_proc_index) { + printk(KERN_WARNING "ERROR: Tried to alloc more processes than available!\n"); + return NULL; + } + printk(KERN_WARNING "alloc proc index: %u\n", procs->curr_proc_index); + // get the memory address of the next free process node space + new_pbm = &(procs->process_buffer[procs->curr_proc_index]); + procs->curr_proc_index++; + //pthread_mutex_unlock(&procs->lock); + return new_pbm; +} + +void debug_print_procs(void) { + size_t i; + PBM p; + printk(KERN_WARNING "-----\nPROCS:\n"); + for(i = 0; i <= procs->curr_proc_index; i++) { + p = procs->process_buffer[i]; + printk(KERN_WARNING "root: %llx, last: %llx, children: %llx, next_sib: %llx\n", (uint64_t)p.root, (uint64_t)p.last, (uint64_t)p.children, (uint64_t)p.next_sib); + } + printk(KERN_WARNING "^^^^^\n"); +} + +/* ----------------------------------------------------------------------------- +* General function for management and creation of program behavior models (PBMs) +*/ + +void pbm_init(void) { + init_pbm_int_map(); + init_task_buf(); + init_proc_buf(); +} + +void pbm_uninit(void) { + uninit_pbm_int_map(); + uninit_task_buf(); + uninit_proc_buf(); +} + +/* Insert a task node into the PBM of given type (only COMP for now) */ +int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc) { + pbm_NODE* node; + unsigned long irq_flags; - struct perf_event_attr pe; struct perf_event *pevent; - u64 cpu; - if (!is_initialized) { - init_task_pevent_map(); - is_initialized = 1; + // only continue for COMP-nodes since other types are not implemented yet + if(!(pbm && type == COMP)) { + return 0; } - memset(&pe, 0, sizeof(struct perf_event_attr)); - pe.type = PERF_TYPE_HARDWARE; - pe.size = sizeof(struct perf_event_attr); - pe.config = PERF_COUNT_HW_INSTRUCTIONS; - pe.sample_period = 400800; - pe.disabled = 0; // start the counter as soon as we're in userland - pe.pinned = 1; // ? - pe.exclude_kernel = 1; - pe.exclude_hv = 1; + /* + * Append a new node to the task graph + */ + node = _pbm_create_node(type, proc->pid); + if(!node) { + printk(KERN_WARNING "ERROR: Could not create node!\n"); + return 0; + } + if(!pbm->root) { + pbm->root = node; + } else { + pbm->last->children = node; // append the new node + } + // finally the new node becomes the last inserted one + pbm->last = node; - // Not needed on 3.2? - pe.wakeup_events = 1; + /* + * Configure the performance counter + */ + memset(&(pbm->pea), 0, sizeof(struct perf_event_attr)); + pbm->pea.type = PERF_TYPE_HARDWARE; + pbm->pea.size = sizeof(struct perf_event_attr); + pbm->pea.config = PERF_COUNT_HW_INSTRUCTIONS; + pbm->pea.sample_period = 400800; + pbm->pea.disabled = 0; // start the counter as soon as we're in userland + pbm->pea.pinned = 1; + pbm->pea.exclude_kernel = 1; + pbm->pea.exclude_hv = 1; + pbm->pea.wakeup_events = 1; // Not needed on 3.2? - cpu = smp_processor_id(); - printk(KERN_WARNING "TASK: %u, CPU: %llu, PTR: %llu\n", p->pid, cpu, (u64)p); + printk(KERN_WARNING "TASK: %u, CPU: %i, PTR: %llu\n", proc->pid, smp_processor_id(), (u64)proc); + /* + * Try to enable the performance counter + */ // disable irqs to make 'perf_event_ctx_activate' in 'kernel/events/core.c' happy local_irq_save(irq_flags); pevent = perf_event_create_kernel_counter( - &pe, + &(pbm->pea), -1, // measure on all cores (in case the process runs on different ones) - p, // exclusively measure the forked process (BEWARE: a process can only measure itself!) + proc, // exclusively measure the forked process (BEWARE: a process can only measure itself!) NULL, //&overflow_handler, NULL ); local_irq_restore(irq_flags); - if (IS_ERR(pevent)) { - printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", p->pid, PTR_ERR(pevent)); + printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", proc->pid, PTR_ERR(pevent)); + // cast to prevent compiler warnings + if (-EOPNOTSUPP == (int64_t)pevent) { + printk(KERN_WARNING + "TASK: %u | EOPNOTSUPP (-95): The hardware does not support certain attributes! " + "E.g. perf_event_attr.precise_ip > 0 may not be supported.\n", proc->pid); + } + if (-EINVAL == (int64_t)pevent) { + printk(KERN_WARNING + "TASK: %u | EINVAL (-22): Invalid argument!" + "E.g. CPU with given index does not exist.\n", proc->pid); + } return 0; } if (pevent->state != PERF_EVENT_STATE_ACTIVE) { - printk(KERN_WARNING "TASK: %u | Event is inactive", p->pid); + printk(KERN_WARNING "TASK: %u | Event is inactive", proc->pid); } + pbm->pevent = pevent; - add_proc_to_map(p->pid, pevent); + printk(KERN_WARNING "TASK: %u | Counting started...\n", proc->pid); - printk(KERN_WARNING "TASK: %u | Counting started...\n", p->pid); - return 1; + return 2; } -/* - * handle the perf overflow event -> task needed more instructions than planed - */ -void overflow_handler( - struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - printk(KERN_WARNING "PB TASK RAN TOO LONG\n"); - //perf_event_get - //->unlocked_ioctl(event, reset); -} - -int stop_counting(void) { +/* Conclude the last task of the given PBM */ +int pbm_task_end(PBM* pbm) { unsigned long irq_flags; int read_error; struct perf_event *pevent; u64 perf_counter; + if (!pbm) { + printk(KERN_WARNING "Error: Could not end given task due to invalid PBM!\n"); + return 0; + } + + // record performance results if (!is_initialized) { printk(KERN_WARNING "TASK: %u | Pevent map not initialized!\n", current->pid); return 0; } - printk(KERN_WARNING "TASK: %u | Stopping counting...\n", current->pid); - pevent = get_pevent_by_pid(current->pid); + pevent = pbm->pevent; if (!pevent) { printk(KERN_WARNING "TASK: %u | ERROR: Could not find perf_event!\n", current->pid); return 0; @@ -163,12 +332,21 @@ int stop_counting(void) { printk(KERN_WARNING "TASK: %u | PEVENT INVALID\n", current->pid); return 0; } - read_error = perf_event_read_local(pevent, &perf_counter); if (read_error) { printk(KERN_WARNING "TASK: %u | FETCHING PERFORMANCE COUNTER IN stop_counting FAILED WITH %i\n", current->pid, read_error); + if (-EINVAL == (int64_t)read_error) { + // If this is a per-task event, it must be for current. + // If this is a per-CPU event, it must be for this CPU. + printk(KERN_WARNING + "TASK: %u | EINVAL (-22): Invalid argument! " + "E.g. trying to measure a different task than itself.\n", current->pid); + } + } else { + pbm->last->count = perf_counter; } + // disable performance counter while preventing context switching local_irq_save(irq_flags); perf_event_disable(pevent); perf_event_release_kernel(pevent); @@ -176,5 +354,368 @@ int stop_counting(void) { pevent = NULL; printk(KERN_WARNING "TASK: %u | ...Counting stopped: %llu instr.\n", current->pid, perf_counter); - return 0; + + return 0; } + +/** + * Why is the code concerning the forking separated into the two functions + * pbm_fork_parent_new_task() and pbm_fork() instead of simply putting it at the end of _do_fork? + * + * The separation is necessary since in the _do_fork a context switch from the parent to the + * child process takes place which is problematic since we want to end (and restart) perf-measuring + * the parent as well as the child process and the measurements (in pbm_task_end()) can only happen + * from the process itself. But in the beginning of _do_fork the child process does not exist yet. + * Therefore we have to split the code into the two functions to be able to measure the parent + * before the context switch as well as initialize the child-measuring after switching to the child. + */ + +/* Stop previous task and start new task for the parent process and also reset the perf counter + * Returns a pointer to the fork-task-node which the forked process can use as a time information. + * + * BEWARE: + * Inside the _do_fork routine the context gets switched from the parent to the child process. + * This function must get called in the _do_fork() routine BEFORE (!) the child process starts to + * run (current == parent) otherwise the perf counting will fail! + */ + pbm_NODE* pbm_fork_parent_new_task(struct task_struct* parent_proc) { + PBM* parent_pbm; + pbm_NODE* fork_node; + + // end task of parent process + parent_pbm = get_pbm_by_pid(parent_proc->pid); + if(!parent_pbm) { + printk(KERN_WARNING "COULD NOT FIND PARENT-PBM!\n"); + //TODO Since this will happen right at the first call to pbm_fork() + //TODO since no parent process has been initialized (since this is + //TODO the first relevant parent process) + return NULL; + } + pbm_task_end(parent_pbm); + + /* + * Before starting the new task, append the fork-node to the task graph to maintain the correct order + */ + fork_node = _pbm_create_node(FORK, parent_proc->pid); + if(!fork_node) { + printk(KERN_WARNING "COULD NOT CREATE NEW FORK NODE!\n"); + return NULL; + } + if(!parent_pbm->root) { + parent_pbm->root = fork_node; + } else { + parent_pbm->last->children = fork_node; // append the new node + } + parent_pbm->last = fork_node; // the new node becomes the last inserted one + + // start the new task + pbm_task_start(parent_pbm, COMP, parent_proc); + + return fork_node; + } + +//TODO Consider the difference between calling it from the root-process (that has no registered parent process) and from "normal" child processes (that are registered in the maps) +/* Insert a FORK node into the given PBM for up to 'num_thr' child threads + * + * BEWARE: + * Inside fork.c:_do_fork() the context gets switched from the parent to the child process. + * This function must get called in the _do_fork() routine AFTER (!) the child process starts to + * run (current == child) otherwise the perf counting will fail! + */ +int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_date) { + unsigned long irq_flags; + + PBM* parent_pbm; + PBM* child_pbm; + + /* NOTE: + * since the first time that _do_fork() is called the parent is "bash" which causes the + * pbm_fork_parent_new_task() to not be called but pbm_fork() since the child is "mpirun" so we + * put the initialization in here instead of into pbm_fork_parent_new_task(). + */ + // avoid context switching during initialization by disabling interrupts + local_irq_save(irq_flags); + if (!is_initialized) { + pbm_init(); + is_initialized = 1; + } + local_irq_restore(irq_flags); + + child_pbm = get_pbm_by_pid(proc->pid); + parent_pbm = get_pbm_by_pid(parent_pid); + + printk(KERN_WARNING "FORK: %u from parent %u\n", proc->pid, parent_pid); + + // check if the child already exists (if and only if the "forked" process + // is the process itself which happens in this case because OpenMP also + // uses the parent process for parallel calculations) + if (!child_pbm) { + // Create and initialize a new PBM for the child + { + child_pbm = proc_alloc(); + if(!child_pbm) { + printk(KERN_WARNING "ERROR: Could not alloc child-PBM! %llx\n", (uint64_t)&child_pbm); + return 0; + } + + // general configurations for perf_event interface + child_pbm->pea.size = sizeof(struct perf_event_attr); + + child_pbm->root = NULL; + child_pbm->last = NULL; + child_pbm->children = NULL; + child_pbm->next_sib = NULL; + child_pbm->fork_date = NULL; // this gets updated later in the function // TODO Remove initialization here since it is done later? + child_pbm->exit_date = NULL; + } + if (!add_proc_to_map(proc->pid, child_pbm)) { + printk(KERN_WARNING "FORK ERROR: Could not add process to map: %u\n", proc->pid); + return 0; + // TODO Reverse previous allocation of child pbm? + } else { + printk(KERN_WARNING "Added process to map: %u\n", proc->pid); + debug_print_map(); //FIXME + } + } else { + printk(KERN_WARNING "Process already exists: %u\n", proc->pid); + } + + // add child pbm to parents children + if(parent_pbm) { // checking this is only important in case of the root task which has no (recorded) parent + if(parent_pbm->children) { + // prepend the child to the list of children so we dont have to + // modify the sibbling (which works since we use a single-linked + // list) + child_pbm->next_sib = parent_pbm->children; + } + parent_pbm->children = child_pbm; + } + + // We have to know WHEN the exit happens relative to the parent. So every child remembers the + // current fork-task-node of the parent on exit (so that the join can happen at the correct + // position (more or less, may be imperfect due to parallelism)) + child_pbm->fork_date = fork_date; + + // continue performance counting for child (restarting parent counting has already been started + pbm_task_start(child_pbm, COMP, proc); + + return 1; +} + +// This should get called by the child at sysexit() +int pbm_exit(pid_t pid, pid_t parent_pid) { + PBM* pbm; + PBM* parent_pbm; + + printk(KERN_WARNING "EXIT: %u\n", pid); + pbm = get_pbm_by_pid(pid); + if(!pbm) { + printk(KERN_WARNING "COULD NOT FIND PBM!\n"); + debug_print_map(); + return 0; + } + pbm_task_end(pbm); + + parent_pbm = get_pbm_by_pid(parent_pid); + // set current parent task as the exit task of this child where the join + // gets inserted + if(parent_pbm) { + pbm->exit_date = parent_pbm->last; + } + + return 1; +} + +/* ----------------------------------------------------------------------------- +* PBM graph post-processing functions +*/ + +/* Insert a JOIN node into the given PBM and merge the forked child PBMs into this PBM */ +int pbm_join(PBM* child_pbm) { + pbm_NODE* fork_node; + pbm_NODE* join_node; + pid_t join_label; + + if(!child_pbm) + return 0; + + fork_node = child_pbm->fork_date; + + // the child process is used to label the join operation to know which process the join belongs + // to since using the parent as the label would be ambiguous since more than one child could + // have been spawned by the same parent + join_label = child_pbm->last->thread_id; + join_node = _pbm_create_node(JOIN, join_label); + if(!join_node) { + printk(KERN_WARNING "ERROR: Could not create node!\n"); + return 0; + } + + // any fork-node has exactly two children because a fork creates only one + // copy of an existing process (1 (copy) + 1 (existing) = 2) + fork_node->count = 2; + join_node->count = 2; + + // insert front of child task graph into parent task graph: + // prepend child-task-tree to list of child-nodes in the fork-node + { + // assumption: child_pbm->root->next_sib == NULL (since any fork-node + // has at most two children and only one before the join-operation) + child_pbm->root->next_sib = fork_node->children; + fork_node->children = child_pbm->root; + } + + // insert back of child task graph with appended join-node into parent task + // graph + { + // assumption: child_pbm->last->children == NULL (since it should be + // the last task the child did before exit) + // append join node to child task-graph + child_pbm->last->children = join_node; + // insert join node directly after the exit-date-node in the parent pbm + join_node->children = child_pbm->exit_date->children; + child_pbm->exit_date->children = join_node->children; + } + + return 1; +} + +// recursively traverse all PBMs and insert the child task-graphs +void pbm_post_processing(PBM* pbm) { + PBM* sib_pbm; + PBM* child_pbm = pbm->children; + if(child_pbm) { + pbm_post_processing(child_pbm); + pbm_join(child_pbm); + // TODO Remove from list of childs or just mark as visited? + } + + sib_pbm = pbm->next_sib; + if(sib_pbm) { + pbm_post_processing(sib_pbm); + // TODO Remove from list of siblings or just mark as visited? + } +} + +/* ----------------------------------------------------------------------------- +* PBM graph output functions +*/ + +void pbm_join_and_print_graph_self(pid_t pid) { + PBM* pbm; + + debug_print_map(); + debug_print_procs(); + debug_print_tasks(); + printk(KERN_WARNING "indices: %lu, %u, %u\n", index_2_pbm->last_proc_index, procs->curr_proc_index, tasks->curr_task_index); + pbm = get_pbm_by_pid(pid); + if (pbm) { + pbm_post_processing(pbm); + pbm_print_graph(pbm, pbm->root); + } else { + printk(KERN_WARNING "JOIN: PBM not found for: %u\n", pid); + } +} + +/* Crude recursive ADG printer, starts with given node */ +void pbm_print_graph(PBM* pbm, pbm_NODE* node) { + pbm_NODE* root; + char types[5][5] = {"", "FORK", "JOIN", "COMP", "COMM"}; + + if(!node) + return; + + if(node->visited) + return; + + root = node; + + printk(KERN_WARNING "Node %p: (%s, count = %llu), children:\n", node, types[node->type], node->count); + + if(node->children) + { + node = node->children; + while(node) + { + printk(KERN_WARNING " -- Node %p: (%s, count = %llu), next sibling: %p\n", node, types[node->type], node->count, node->next_sib); + node = node->next_sib; + } + } + + if(root->children) + pbm_print_graph(pbm, root->children); + + if(root->next_sib) + pbm_print_graph(pbm, root->next_sib); + + root->visited = 1; + if(root == pbm->root) + _pbm_unvisit_node(pbm->root); +} + +/* ----------------------------------------------------------------------------- +* Auxiliary functions, not for public use. +*/ + +pbm_NODE* _pbm_create_node(uint8_t type, pid_t pid) { + pbm_NODE* node = task_alloc(); + if(!node) + return NULL; + + node->thread_id = pid; + node->type = type; + node->count = 0; + node->children = NULL; + node->next_sib = NULL; + node->visited = 0; + return node; +} + +// recursive +void _pbm_unvisit_node(pbm_NODE* node) { + if(!node) + return; + + if(!node->visited) + return; + + if(node->children) + _pbm_unvisit_node(node->children); + + if(node->next_sib) + _pbm_unvisit_node(node->next_sib); + + node->visited = 0; +} + +/******************************************************************************/ + +/* + * handle the perf overflow event -> task needed more instructions than planed + */ +/* +static void overflow_handler( + struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + printk(KERN_WARNING "PB TASK RAN TOO LONG\n"); +} +*/ + +int is_root_process(struct task_struct* p) { + return strcmp(p->comm, "mpirun") == 0; +} + +int is_relevant_process(struct task_struct* p) { + struct task_struct* proc = p; + // check if mpirun is a parent, super-parent, ... until the root-parent ("swapper") is found + while (proc && !(strcmp(proc->comm, "swapper") == 0)) { + if (is_root_process(proc)) { + return 1; + } + //printk(KERN_WARNING "Searching relevant process: %s\n", proc->comm); + proc = proc->real_parent; + } + return 0; +} \ No newline at end of file diff --git a/kernel/behave.h b/kernel/behave.h index 4e83c96cfcce2d12280fcbf663620b557f3f1e8f..881970f662dedbf241261bb474334ec4deb5c1cd 100644 --- a/kernel/behave.h +++ b/kernel/behave.h @@ -3,12 +3,176 @@ #include <linux/perf_event.h> +/****************************************************************************** +* Based on "libpbm": +* Program Behaviour Model (PBM) as a Task Precedence Graph (TPG), +* implemented as a Acyclic Directed Graph (ADG) structure. Using +* Linux' perf_event interface for task performance measurement. +* Author: Michael Zent +* Context: Softwareproject 'Cluster Management', Lecturer: Barry Linnert, SS 2022 FU Berlin +*/ + +// node types +#define FORK 1 +#define JOIN 2 +#define COMP 3 // computation +#define COMM 4 // communication (not supported yet) + +/* +* PBM node, describing a program task +*/ +typedef struct _pbm_NODE +{ + uint8_t type; // FORK, JOIN, or COMP (COMM not supported yet) + int32_t thread_id; // ID of the current thread within its thread group + + /* + * Performance count value, interpretation depends on type + * FORK - Number of forked threads + * JOIN - Number of joined threads + * COMP - Number of instructions needed to complete the task + * COMM - Total length of all messages sent, in byte (not supported yet) + */ + uint64_t count; + + /* + * Inter-node connectors + * children - First child in a list of children. There should be >= 1 children + * only if type == FORK, else a node has just one child. + * next_sib - Next sibling of a node. Should be != NULL only if the node is a + * child of a FORK node, with exception of the last child. + */ + struct _pbm_NODE* children; // first child (in a list of children) + struct _pbm_NODE* next_sib; // next sibling + + // marker for graph traversion + uint8_t visited; +} pbm_NODE; + +/* +* Program Behavior Model (PBM) +*/ +typedef struct _PBM +{ + pbm_NODE* root; // first task of a thread // TODO Rename to first or first_task? + pbm_NODE* last; // current last task + + // the task nodes of the parent which are used as time markers/dates to + // know where the task graph must be inserted in the parent task-graph in + // the post-processing stage + pbm_NODE* fork_date; // fork task of the parent process + pbm_NODE* exit_date; // current/last task of the parent while this child exited + + /* + * The Fork Buffer contains pointers to PBMs which describe the forked + * child threads. Should be != NULL with size > 0 only after forking. + */ + struct _PBM* children; // first child (in a list of forked children) + struct _PBM* next_sib; // next sibling + + /* + * Performance measurement and recording + */ + struct perf_event_attr pea; // config info for the perf_event interface + struct perf_event* pevent; +} PBM; + +/* +* Creates a new PBM and initializes it. +* Should be the first PBM-function called. +* +* Returns a pointer to that PBM, or NULL on error. +*/ +PBM* pbm_create(void); + +/* +* Deletes the given PBM and frees the associated resources. +* Should only be called if pbm_create() was called prior, and +* if no pbm_task_start() or pbm_fork() remained unclosed. +*/ +void pbm_destroy(PBM* pbm); + +/* +* Inserts into the given PBM a new node of given type (for now only +* COMP, as COMM in not supported yet) and starts performance counting. +* Should be called immediately before the task starts, and be closed +* by pbm_task_end(). +* +* Returns 0 on error, 1 on full success, or 2 if performance counting +* could not start. +*/ +int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc); + +/* +* Ends performance counting for the last node of the given PBM and +* records the results. +* Should be called immediately after the task ends, and as the next +* PBM-method after pbm_task_start(). +* +* Returns 0 on failure, i.e. the performance counts could not be +* recorded, otherwise returns 1. +*/ +int pbm_task_end(PBM* pbm); + + pbm_NODE* pbm_fork_parent_new_task(struct task_struct* parent_proc); + +/* +* Inserts into the given PBM a FORK node which may have up to +* 'num_thr' children, describing the forked child threads. If +* num_thr == 0 the max number of threads is determined auto- +* matically via OpenMP's omp_get_max_threads(). +* Should be called immediately before the thread is forked, and be +* closed by pbm_join(). +* +* Returns 0 on error, or 1 on success. +* +* The process as an explicit argument is needed since the fork is called by the parent. +*/ +int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_date); + +int pbm_exit(pid_t pid, pid_t parent_pid); + +/* +* Inserts into the given PBM a JOIN node. Records the actual number of +* forked threads. Merges the sub-PBMs, describing the child threads, from +* the fork buffer into the parent PBM and releases the fork-buffer. +* Should be called immediately after the child threads are joined, and as +* the next PBM-method after pbm_fork(). +* +* Returns 0 on error, or 1 on success. +*/ +int pbm_join(PBM* child_pbm); + +/* +* Crude recursive ADG printer. Starts with the given node, which +* should be the root node of the given PBM. +*/ +void pbm_print_graph(PBM* pbm, pbm_NODE* node); + +void pbm_join_and_print_graph_self(pid_t pid); + +/* +* Auxiliary methods, not for public usage. Nomen est omen. +*/ +pbm_NODE* _pbm_create_node(uint8_t type, pid_t pid); +void _pbm_destroy_node(pbm_NODE* node); +void _pbm_unvisit_node(pbm_NODE* node); + +/* +* Initialize the map necessary to locate the PBM by given PID +* +* BEWARE: Forgetting to call this before calling fork() will trigger +* Segmentation faults! +*/ +void pbm_init(void); +void pbm_uninit(void); + +/******************************************************************************/ + +int is_root_process(struct task_struct* p); +int is_relevant_process(struct task_struct *p); + int start_counting(struct task_struct *p); int stop_counting(void); -void overflow_handler( - struct perf_event *, - struct perf_sample_data *, - struct pt_regs *regs); - #endif //PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H diff --git a/kernel/exit.c b/kernel/exit.c index 97470434dad1b7dd2884c5bfe33a4d259590b21e..1d7f88f7aec2e1e55e042ce72a034f57a3d9a4e2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -768,11 +768,11 @@ void __noreturn do_exit(long code) int group_dead; // call the readout before the process is terminated - if (strcmp(tsk->real_parent->comm, "bash") == 0 - || strcmp(tsk->real_parent->real_parent->comm, "bash") == 0 - || strcmp(tsk->real_parent->real_parent->real_parent->comm, "bash") == 0) { - stop_counting(); - printk(KERN_EMERG "EXIT: %u, CMD: '%s', PARENT-CMD: '%s', PTR: %llu\n", tsk->pid, tsk->comm, tsk->real_parent->comm, (u64)tsk); + if (is_relevant_process(tsk)) { + pbm_exit(tsk->pid, tsk->real_parent->pid); + } + if (is_root_process(tsk)) { + pbm_join_and_print_graph_self(tsk->pid); } TASKS_RCU(int tasks_rcu_i); diff --git a/kernel/fork.c b/kernel/fork.c index 2c2aab1bbb6aa3e7201febd4f1fe03bf83a8f0af..b3ec7595bd1cc3df74a9fca6d8e205f92eff06fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2016,7 +2016,16 @@ long _do_fork(unsigned long clone_flags, int trace = 0; long nr; - printk(KERN_EMERG "DO FORK CALLED by: %u\n", current->pid); + pbm_NODE* fork_date; + pid_t parent_pid = current->pid; + fork_date = NULL; + + // FIXME: This will not get called for mpirun since then bash will be the parent here + if (is_relevant_process(current)) { + printk(KERN_EMERG "DO FORK CALLED by: '%s' %u\n", current->comm ,parent_pid); + fork_date = pbm_fork_parent_new_task(current); + } + /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2060,6 +2069,16 @@ long _do_fork(unsigned long clone_flags, get_task_struct(p); } + // this must be called before the new task wakes up to make sure that + // initialization of the perf event is done at that point + if (is_relevant_process(p)) { + // BEWARE: At this point p->comm is not yet up to date but shows the + // command of the parent! + printk(KERN_EMERG "FORKED!!!!: %u, Parent: %s, Super-Parent:%s\n", + p->pid, p->comm, p->real_parent->real_parent->comm); + pbm_fork(p, parent_pid, fork_date); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ @@ -2072,15 +2091,6 @@ long _do_fork(unsigned long clone_flags, } put_pid(pid); - printk(KERN_EMERG "FORKED!!!!: %u, Parent: %s, Super-Parent:%s\n", - p->pid, p->comm, p->real_parent->real_parent->comm); - //FIXME At this point p->comm is not up to date but shows the command of the parent! - // (This may not be a problem since the name of the forked processes are not needed?) - if (strcmp(p->comm, "bash") == 0 - || strcmp(p->real_parent->comm, "bash") == 0 - || strcmp(p->real_parent->real_parent->comm, "bash") == 0) { - start_counting(p); - } } else { nr = PTR_ERR(p); }