diff --git a/kernel/behave.c b/kernel/behave.c
index 821e231901c6c5e60f039ea1847a0a24ae88bc76..129d699b3b3c5b799f29a74c2e01eedd5cf8e049 100644
--- a/kernel/behave.c
+++ b/kernel/behave.c
@@ -31,6 +31,9 @@ SYSCALL_DEFINE0(pbm_set_root_proc) {
 	return 0;
 }
 
+// FIXME: THIS IS JUST FOR TESTING PURPOSES so that I don't have to read and write the reference PID manually in the console
+pid_t last_plan_pid;
+
 /******************************************************************************
  * Based on "libpbm" (see header file for more info)
  */
@@ -490,8 +493,8 @@ int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_node) {
 		BUG_ON(current->pid != root_proc);
 	}
 
-	// We have to know WHEN the exit happens relative to the parent. So every child remembers the
-	// current fork-task-node of the parent on exit (so that the join can happen at the correct
+	// We have to know WHEN the fork happens relative to the parent. So every child remembers the
+	// current fork-task-node of the parent on exit (so that the merge can happen at the correct
 	// position (more or less, may be imperfect due to parallelism))
 	child_pbm->fork_date = fork_node;
 
@@ -600,7 +603,7 @@ void pbm_post_processing(PBM* pbm) {
 	if(child_pbm) {
 		printk(KERN_WARNING "Joining child thread %u\n", child_pbm->last->thread_id);
 		pbm_post_processing(child_pbm);
-		pbm_join(child_pbm);
+		//pbm_join(child_pbm); // TODO Remove from list of children or just mark as visited?
 	}
 
@@ -608,11 +611,19 @@ void pbm_post_processing(PBM* pbm) {
 	if(sib_pbm) {
 		printk(KERN_WARNING "Joining sib thread %u\n", sib_pbm->last->thread_id);
 		pbm_post_processing(sib_pbm);
-		pbm_join(sib_pbm);
+		//pbm_join(sib_pbm); // TODO Remove from list of siblings or just mark as visited?
 	}
 }
 
+/* Crude recursive PBM node counter, starts with the given PBM */
+u64 pbm_count_children(PBM* pbm) {
+	printk(KERN_WARNING "Node %lli: (pid = %u)\n", pbm_2_index(pbm), pbm->root->thread_id);
+	// count this node itself (1) and add the counts of its children and siblings, if present
+	return 1 + (pbm->children ? pbm_count_children(pbm->children) : 0)
+	         + (pbm->next_sib ? pbm_count_children(pbm->next_sib) : 0);
+}
 
 /* -----------------------------------------------------------------------------
  * PBM graph output functions
  */
@@ -634,6 +645,8 @@ void pbm_join_and_print_graph_self(pid_t pid) {
 	printk(KERN_WARNING "indices: %lu, %u, %u\n", index_2_pbm->last_proc_index, procs->curr_proc_index, tasks->curr_task_index);
 	pbm = get_pbm_by_pid(pid);
 	if (pbm) {
+		pbm->child_count = pbm_count_children(pbm);
+		printk(KERN_WARNING "Child count: %llu\n", pbm->child_count);
 		pbm_post_processing(pbm);
 		printk(KERN_WARNING "After post-processing:\n");
 		debug_print_tasks();
@@ -643,6 +656,10 @@ void pbm_join_and_print_graph_self(pid_t pid) {
 	}
 	// reset so that is_relevant_process() can return early
 	is_initialized = 0;
+
+	printk(KERN_WARNING "ROOT: %u\n", pid);
+	// FIXME: THIS IS JUST FOR TESTING PURPOSES
+	last_plan_pid = pid;
 }
 
 /* Crude recursive ADG printer, starts with given node */
@@ -742,4 +759,90 @@ bool is_relevant_process(struct task_struct* p) {
 		proc = proc->real_parent;
 	}
 	return false;
-}
\ No newline at end of file
+}
+
+
+/******************************************************************************
+ * Plan runtime state
+ */
+struct plan_rt_state prs;
+
+void plan_rt_state_init(void)
+{
+	int i;
+	for (i = 0; i < PROC_BUF_SIZE; ++i) {
+		prs.node_stack[i] = NULL;
+		prs.proc_stack[i] = NULL;
+	}
+	prs.stack_size = 0;
+	prs.num_exited_procs = 0;
+}
+
+struct task_struct* plan_rt_state_peek_proc(void)
+{
+	// guard against reading below the stack base when the stack is empty
+	if (0 == prs.stack_size)
+		return NULL;
+	return prs.proc_stack[prs.stack_size - 1];
+}
+
+void plan_rt_state_peek(pbm_NODE** node_res, struct task_struct** proc_res)
+{
+	if (0 != prs.stack_size) {
+		*node_res = prs.node_stack[prs.stack_size - 1];
+		*proc_res = plan_rt_state_peek_proc();
+	}
+}
+
+int plan_rt_state_is_empty(void)
+{
+	// it is sufficient to only check for the process since there should never be a process
+	// without a corresponding node and vice versa
+	return NULL == plan_rt_state_peek_proc();
+}
+
+void plan_rt_state_push(pbm_NODE* node, struct task_struct* proc)
+{
+	if (prs.stack_size >= PROC_BUF_SIZE) {
+		printk(KERN_WARNING "ERROR: Plan runtime stack is full!\n");
+		return;
+	}
+	prs.node_stack[prs.stack_size] = node;
+	prs.proc_stack[prs.stack_size] = proc;
+	prs.stack_size++;
+}
+
+void plan_rt_state_pop(void)
+{
+	if (prs.stack_size == 0) {
+		printk(KERN_WARNING "ERROR: Plan runtime stack is already empty!\n");
+		return;
+	}
+	prs.stack_size--;
+	prs.node_stack[prs.stack_size] = NULL;
+	prs.proc_stack[prs.stack_size] = NULL;
+}
+
+void plan_rt_state_incr_num_exited_procs(void)
+{
+	prs.num_exited_procs++;
+}
+
+u64 plan_rt_state_num_exited_procs(void)
+{
+	return prs.num_exited_procs;
+}
+
+void plan_rt_state_debug_print(void)
+{
+	size_t i;
+	pbm_NODE* node;
+	struct task_struct* task;
+	printk(KERN_WARNING "MAP\n-----\n");
+	for(i = 0; i < prs.stack_size; i++) {
+		node = prs.node_stack[i];
+		task = prs.proc_stack[i];
+		printk(KERN_WARNING "[%lu] %p, %p (%i, %s)\n", i, node, task, task ? task->pid : 0, task ? task->comm : "-");
+	}
+	printk(KERN_WARNING "-----\n");
+}
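For illustration (not part of the patch): the stack discipline implemented above can be exercised in a plain userspace model, with PIDs and strings standing in for task_structs and plan nodes and an assumed PROC_BUF_SIZE. It replays the intended transitions: pb_set_plan() pushes the plan root, a fork replaces the head with the parent's continuation node and pushes the child on top, and an exit pops back to the parent.

#include <stdio.h>
#include <stddef.h>

#define PROC_BUF_SIZE 64          /* assumption: the real value lives in the kernel headers */

static const char *node_stack[PROC_BUF_SIZE];
static int         proc_stack[PROC_BUF_SIZE];   /* PIDs stand in for task_structs */
static size_t      stack_size;

static void push(const char *node, int pid)
{
	if (stack_size >= PROC_BUF_SIZE)
		return;                           /* stack full, mirrors plan_rt_state_push() */
	node_stack[stack_size] = node;
	proc_stack[stack_size] = pid;
	stack_size++;
}

static void pop(void)
{
	if (stack_size == 0)
		return;                           /* stack empty, mirrors plan_rt_state_pop() */
	stack_size--;
}

static void show(void)
{
	for (size_t i = 0; i < stack_size; i++)
		printf("[%zu] %s (pid %d)%s\n", i, node_stack[i], proc_stack[i],
		       i == stack_size - 1 ? "  <- running" : "");
	printf("-----\n");
}

int main(void)
{
	push("root", 100);                        /* pb_set_plan(): push the plan root */
	show();

	/* fork: the head becomes the parent's continuation, the child goes on top */
	pop();
	push("parent-after-fork", 100);
	push("child", 101);
	show();

	pop();                                    /* child exits -> parent runs again */
	show();
	return 0;
}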
task->comm : "-"); + } + printk(KERN_WARNING "-----\n"); +} diff --git a/kernel/behave.h b/kernel/behave.h index ec0439290ddd0ee1f9c1f0325247289b3b5fbac9..8348874054c5a49f8b6d791857afb2cd3032508b 100644 --- a/kernel/behave.h +++ b/kernel/behave.h @@ -1,8 +1,12 @@ #ifndef PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H #define PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H +#include <linux/types.h> #include "sched/perf_error_detection.h" +// FIXME: THIS IS JUST FOR TESTING PURPOSES so that i dont have to read and write the reference pid manually in the console +extern pid_t last_plan_pid; + /****************************************************************************** * Based on "libpbm": * Program Behaviour Model (PBM) as a Task Precedence Graph (TPG), @@ -70,6 +74,9 @@ typedef struct _PBM struct _PBM* children; // first child (in a list of forked children) struct _PBM* next_sib; // next sibling + // number of children (and children's children) it totally spawned + uint64_t child_count; + /* * Performance measurement and recording */ @@ -174,4 +181,43 @@ bool is_relevant_process(struct task_struct *p); int start_counting(struct task_struct *p); int stop_counting(void); +/******************************************************************************/ + +PBM* get_pbm_by_pid(pid_t pid); + +/****************************************************************************** + * Plan runtime state + * + * Represents the current state of the plan at runtime. + * The head of the stack should always be the currently executed process and corresponding plan + * node. + * + * Implementation: + * Two stacks which store the plan-node and the corresponding runtime-process + * accumulated by forking and should be executed from top to bottom (current process is always the + * head). + * This works since we assume that a fork in the plan always corresponds to a runtime fork which + * means that a fork node that we push to the stack as a start for the traversal after the next + * exit-node should always have a corresponding runtime-process which should be run when the + * previous (child) process exits. + */ +struct plan_rt_state { + pbm_NODE* node_stack[PROC_BUF_SIZE]; + struct task_struct* proc_stack[PROC_BUF_SIZE]; + size_t stack_size; + + // number of exited processes of the plan needed to measure of the plan is finished + u64 num_exited_procs; +}; + +void plan_rt_state_init(void); +struct task_struct* plan_rt_state_peek_proc(void); +void plan_rt_state_peek(pbm_NODE** node_res, struct task_struct** proc_res); +int plan_rt_state_is_empty(void); +void plan_rt_state_push(pbm_NODE* node, struct task_struct* proc); +void plan_rt_state_pop(void); +void plan_rt_state_debug_print(void); +void plan_rt_state_incr_num_exited_procs(void); +u64 plan_rt_state_num_exited_procs(void); + #endif //PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H diff --git a/kernel/exit.c b/kernel/exit.c index de4261f61cca3d931bf9b570a06d678a356dd450..08b0df2566aeb55baa8bec80f3271ac93275a6f5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -771,10 +771,29 @@ void __noreturn do_exit(long code) struct rq* rq; rq = this_rq(); + + //FIXME: This disclaimer is not really true anymore (since i dont follow it and it works either way) + /** + * BEWARE: + * We want to update the plan runtime model not too early to ensure that the exiting process + * has already signaled its exit to its parents otherwise the scheduler could switch the + * process to early to the parent which itself is waiting for the child to exit which leads to + * a deadlock! 
+	 * But we also must not update it too late, since then the model would not be up to date
+	 * when the next task is picked; the child would remain the current process of the model,
+	 * which also leads to a deadlock since the child will never be replaced by the
+	 * runnable parent.
+	 * We want to update the model exactly after the child's exit signal has been sent to the
+	 * parent.
+	 */
 	// prevent syscalls from outside of the measured program (e.g. admin tasks) to be recognized
 	if (PB_EXEC_MODE == rq->pb.mode) {
-		// set flag so that the pb-scheduler knows which syscall triggered the scheduling
-		rq->pb.triggering_syscall = sched_trig_EXIT;
+		printk(KERN_EMERG "EXIT: %u, CMD: '%s', PARENT-CMD: '%s'\n", tsk->pid, tsk->comm, tsk->real_parent->comm);
+		// inform the pb-scheduler which syscall triggered the scheduling
+		rq->pb.triggering_syscall.type = sched_trig_EXIT;
+		rq->pb.triggering_syscall.origin = current;
+
+		// inform the plan runtime state
+		plan_rt_state_incr_num_exited_procs();
 	}
 
 	// call the readout before the process is terminated
@@ -913,6 +932,7 @@ void __noreturn do_exit(long code)
 	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
 	TASKS_RCU(preempt_enable());
 	exit_notify(tsk, group_dead);
+	proc_exit_connector(tsk);
 	mpol_put_task_policy(tsk);
 
 #ifdef CONFIG_FUTEX
diff --git a/kernel/fork.c b/kernel/fork.c
index 812c9cbc1c4195a0477fc7e7b0540d6b53e87ee9..f2d26cbec45225621274d683757a3a2895e50b33 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2024,10 +2024,13 @@ long _do_fork(unsigned long clone_flags,
 	fork_date = NULL;
 	rq = this_rq();
+
 	// prevent syscalls from outside of the measured program (e.g. admin tasks) to be recognized
 	if (PB_EXEC_MODE == rq->pb.mode) {
-		// set flag so that the pb-scheduler knows which syscall triggered the scheduling
-		rq->pb.triggering_syscall = sched_trig_FORK;
+		printk(KERN_EMERG "DO FORK CALLED by: '%s' %u\n", current->comm, parent_pid);
+		// inform the pb-scheduler which syscall triggered the scheduling
+		rq->pb.triggering_syscall.type = sched_trig_FORK;
+		rq->pb.triggering_syscall.origin = current;
 	}
 
 	// FIXME: This will not get called for mpirun since then bash will be the parent here
@@ -2064,6 +2067,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!IS_ERR(p)) {
 		struct completion vfork;
 		struct pid *pid;
+		int error;
 
 		trace_sched_process_fork(current, p);
 
@@ -2088,6 +2092,21 @@ long _do_fork(unsigned long clone_flags,
 				       p->pid, p->comm, p->real_parent->real_parent->comm);
 			pbm_fork(p, parent_pid, fork_date);
 		}
+		/**
+		 * BEWARE:
+		 * This info update must happen before the process wakes up since the information is used
+		 * in the task enqueue routine, which would otherwise lead to NULL-pointer errors etc.
+		 */
+		if (PB_EXEC_MODE == rq->pb.mode) {
+			// inform the pb-scheduler which child process has been spawned
+			rq->pb.triggering_syscall.target = p;
+			// start observing the child process
+			error = init_perf_event_into_map(p, 0);
+			if (error) {
+				printk(KERN_WARNING "TASK: %u | Counting NOT started due to error\n", p->pid);
+				return -1;
+			}
+		}
 
 		wake_up_new_task(p);
diff --git a/kernel/sched/pb.c b/kernel/sched/pb.c
index 39748bde853186df9ecc88ea1a0f67e4b5e9f5d3..fb196f10b390f14912815731f65af8d16947fa4c 100644
--- a/kernel/sched/pb.c
+++ b/kernel/sched/pb.c
@@ -5,11 +5,31 @@
 #include <linux/spinlock.h>
 #include <linux/perf_event.h>
 #include <linux/kthread.h>
+#include <linux/list.h>
 
 #include "sched.h"
 
 typedef struct pb_plan pb_plan_t;
 
+// terminal colors (source: https://pkg.go.dev/github.com/whitedevops/colors)
+#define Red    "\033[91m"
+#define Yellow "\033[33m"
+#define Cyan   "\033[36m"
+#define Bold   "\033[1m"
+#define End    "\033[0m"
+
+
+static void reset_triggering_syscall_info(void) {
+	struct pb_rq *pb = &(this_rq()->pb);
+	pb->triggering_syscall.type = sched_trig_OTHER;
+	pb->triggering_syscall.origin = NULL;
+	pb->triggering_syscall.target = NULL;
+}
+
+
+/* -------------------------------------------------------------------------- */
+
 SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
+	pb_plan_t _plan;
 	struct task_struct* task;
 	struct rq* rq;
 	struct pb_rq* pb_rq;
@@ -19,6 +39,8 @@ SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
 	unsigned long copied;
 	unsigned int i;
 	int res;
+
+	PBM* pbm;
 
 	copied = copy_from_user(&_plan, plan, sizeof(pb_plan_t));
 
@@ -47,12 +69,34 @@ SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
 		return -1;
 	}
 
+	pbm = get_pbm_by_pid(last_plan_pid/*_plan.ref_pid (FIXME: THIS IS JUST FOR TESTING PURPOSES since it would be better to be able to pass the wanted PID as an argument)*/);
+
+	printk(KERN_WARNING "Init Plan RunTime state\n");
+	// reset plan runtime state
+	plan_rt_state_init();
+
+	// prepare the plan runtime stack by pushing the root node of the reference model/plan
+	plan_rt_state_push(pbm->root, task);
+
 	rq = this_rq();
 
+	// prevent the pb-scheduler from being informed of the fork of the root process since we want
+	// it to continue running instead of being switched out
+	reset_triggering_syscall_info();
+
+	// Only the root process has its scheduling class set manually here, since it was already
+	// initialized for the fair scheduler during the fork() syscall in pb_submitter. That fork
+	// must happen before the plan is initialized because the PID of the root process is needed
+	// to start the performance counting as early as possible.
+	// BEWARE: This also means that the scheduling class of this root process needs to be reset to
+	// the fair scheduler on exit so that it can do the necessary cleanup in its data structures.
 	task->sched_class = &pb_sched_class;
 
 	pb_rq = &rq->pb;
 
+	pb_rq->root_proc = task;
+	pb_rq->num_exited_procs = pbm->child_count;
+
 	set_pb_plan_size(pb_rq, _plan.num_tasks);
 
 	for (i = 0; i < _plan.num_tasks; i++ ) {
@@ -86,15 +130,15 @@ int pb_submit_plan(struct rq *rq)
 	 * Must be volatile to ensure correct initialization order
 	 */
 	volatile struct pb_rq * pb = (volatile struct pb_rq*)(&(rq->pb));
-	int perf_init_res;
+	int error;
 	int i = 0;
 
 	if (pb->mode != PB_DISABLED_MODE) {
 		return -1;
 	}
 
-	perf_init_res = init_perf_event(pb->plan[i].task_struct, pb->plan[i].n_instr, &(pb->pevent));
-	if(perf_init_res < 0) {
+	error = init_perf_event_into_map(pb->plan[i].task_struct, pb->plan[i].n_instr);
+	if(error) {
 		//initialization error detection/handling could happen here
 		printk(KERN_WARNING "PB INIT,%u: FAILED OPEN PERF EVENT\n", i);
 	} else {
@@ -146,21 +190,88 @@ void init_pb_rq(struct pb_rq *pb_rq)
 	pb_rq->mode = PB_DISABLED_MODE;
 	pb_rq->c_entry = 0;
 	pb_rq->size = 0;
-	pb_rq->pevent = NULL;
 	pb_rq->is_initialized = 0;
 	pb_rq->waiting_on_io = 0;
+
+	init_pid_2_pevent_map();
 }
 EXPORT_SYMBOL(init_pb_rq);
 
 // IO has finished, we can schedule the next task
+/**
+ * This is called in activate_task() in wake_up_new_task(), which gets called during _do_fork()
+ * while switching from parent to child context. __schedule() may be called after it depending on
+ * various conditions (see __schedule in kernel/sched/core.c), and it is therefore always called
+ * before dequeue_task_pb() and pick_next_task_pb().
+ */
 static void enqueue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct pb_rq *pb = &(rq->pb);
 	pb->waiting_on_io = 0;
+
+	if (rq->pb.mode == PB_EXEC_MODE) {
+		printk(KERN_WARNING "ENQUEUE TASK %u\n", p->pid);
+		if (pb->triggering_syscall.type == sched_trig_FORK) {
+			pbm_NODE* cur_node;
+			pbm_NODE* fork_node;
+			struct task_struct* cur_proc;
+			int fork_node_type;
+
+			// save the current node/process for later use since the plan_rt_state might get modified
+			plan_rt_state_peek(&cur_node, &cur_proc);
+			fork_node = cur_node->children;
+			fork_node_type = fork_node->type;
+
+			/**
+			 * the scheduling class (pb) of the forked child is set in kernel/sched/core.c:sched_fork()
+			 */
+
+			// if a fork occurred then the next node should be a fork node
+			if (FORK != fork_node_type) {
+				printk(KERN_WARNING "ERROR: Fork node expected but got: %i\n", fork_node_type);
+				//TODO: Delegate to higher instance
+			}
+
+			/**
+			 * since we prepend the child node in pbm_fork() (see behave.c) the child of a
+			 * fork-node is the child node (->children) while the parent is the next sibling
+			 * (->next_sib).
+			 */
+
+			// update the parent node: Keep the process and replace the node before the fork
+			// with the _parent_ node after it
+			// Precondition: The plan_rt_state is not empty (since pb_set_plan() initialized it)
+			plan_rt_state_pop();
+			plan_rt_state_push(fork_node->children->next_sib, pb->triggering_syscall.origin);
+
+			// add the child
+			plan_rt_state_push(fork_node->children, pb->triggering_syscall.target);
+			plan_rt_state_debug_print();
+
+			// reset the info so that the next relevant triggering syscall can be detected again
+			reset_triggering_syscall_info();
+
+			// force rescheduling so that the child can be picked next
+			resched_curr(rq); //FIXME TESTING!!!
+		}
+	}
 }
 
-// task started IO and thus it is finished
+/**
+ * This is called right AFTER enqueue_task() in wake_up_new_task().
+ */
+static void check_preempt_curr_pb(struct rq *rq, struct task_struct *p, int flags)
+{
+	// NOP
+}
+
+/**
+ * task started IO and thus it is finished
+ *
+ * This is called in __schedule() by deactivate_task() BEFORE pick_next_task(), which means the
+ * state of the plan (e.g. in case of an exit) is not yet updated!
+ */
 static void dequeue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct pb_rq *pb = &(rq->pb);
@@ -168,72 +279,131 @@ static void dequeue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 	u64 perf_counter;
 	u64 counter_diff;
 	u64 read_error;
-	bool premature_finish = false;
+	u64 expected_instr_count;
+	u64 diff_from_expected;
+	pbm_NODE* cur_node;
+	struct task_struct* cur_proc;
+	struct perf_event* pevent;
+	bool process_exited = false;
 
-	if (pb->waiting_on_io) {
+	//printk("Dequeue task: %u\n", p->pid);
+
+	if (pb->waiting_on_io && p->state != TASK_DEAD) {
+		printk("Waiting for IO\n");
 		return;
 	}
 	pb->waiting_on_io = 1;
 	c_entry_curr = pb->c_entry;
+
+	// save the current node/process for later use since the plan_rt_state might get modified
+	plan_rt_state_peek(&cur_node, &cur_proc);
+	pevent = get_pevent_by_pid(cur_proc->pid);
 
-	if(!pb->pevent) {
+	if(!pevent) {
 		printk("WARNING: PERF EVENT IS NULL");
 	}
 
+	if (pb->triggering_syscall.type == sched_trig_EXIT) {
+		// remove the exited process from the stack and run the next available
+		plan_rt_state_pop();
+		process_exited = true;
+		//terminate_perf_event(get_pevent_by_pid(prev_proc->pid));
+		plan_rt_state_debug_print();
+
+		// reset the info so that the next relevant triggering syscall can be detected again
+		reset_triggering_syscall_info();
+	} else {
+		if (!plan_rt_state_is_empty()) {
+			plan_rt_state_debug_print();
+		}
+	}
+
+	if (cur_node) {
+		expected_instr_count = cur_node->count;
+	} else {
+		expected_instr_count = 0;
+	}
+
 	// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
-	read_error = get_perf_counter(pb->pevent, &perf_counter);
+	read_error = get_perf_counter(pevent, &perf_counter);
 	if (read_error) {
 		printk(KERN_WARNING "FETCHING PERFORMANCE COUNTER IN PB SCHEDULER FAILED WITH %llu\n", read_error);
 	}
-	counter_diff = perf_counter - pb->total_instr;
+	counter_diff = perf_counter; //- pb->total_instr;
 	pb->plan[c_entry_curr].n_instr_counted = counter_diff;
 	pb->total_instr = perf_counter;
-	if (counter_diff < pb->plan[c_entry_curr].n_instr) {
-		u64 under_time = pb->plan[c_entry_curr].n_instr - counter_diff;
-
-		printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO SHORT\n", pb->plan[pb->c_entry].task_id, under_time);
-	} else if (counter_diff > pb->plan[c_entry_curr].n_instr) {
-		//TODO: Check if actually an overflow occurs and an another calculation is necessary
-		// (setting a flag in the perf overflow_handler could be a solution)
-		u64 over_time = counter_diff - pb->plan[c_entry_curr].n_instr;
-
-		printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO LONG\n", pb->plan[pb->c_entry].task_id, over_time);
+	// these are u64 values, so compute the absolute difference explicitly (abs() would not work here)
+	diff_from_expected = counter_diff > expected_instr_count ? counter_diff - expected_instr_count : expected_instr_count - counter_diff;
+	//TODO: Set proper threshold for significance (relative values would probably be better than absolutes)
+	//if (diff_from_expected > 0)
+	{
+		printk(KERN_WARNING Yellow Bold "PB TASK %u RAN %llu / %llu INSTRUCTIONS TOO %s" End "\n",
+		       cur_proc->pid, diff_from_expected, expected_instr_count,
+		       counter_diff < expected_instr_count ? "SHORT" : "LONG");
 	}
 
 	pb->c_entry++;
 
-	/**
-	 * Don't schedule a task that is dead. (e.g. plan was incorrect and program finished quicker)
-	 * TODO: if we have multiple tasks structs try the next plan entry
-	 */
-	if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
-		premature_finish = true;
+	//TODO: Can this actually happen? Can a process die without calling exit?
+	// remove a dead process which has not called exit from the plan
+	if (!process_exited && cur_proc->state == TASK_DEAD) {
+		plan_rt_state_pop();
 	}
 
-	if (pb->c_entry >= pb->size || premature_finish) {
-		if (premature_finish) {
+	if (is_plan_finished(pb)) {
+		if (!is_plan_successful(pb)) {
 			printk(KERN_WARNING "PLAN TERMINATED PREMATURELY \n");
 		} else {
-			printk(KERN_WARNING "PLAN DONE \n");
+			printk(KERN_WARNING Bold Yellow "PLAN DONE" End "\n");
 		}
 
 		// set back to cfs for completion of task
 		pb->is_initialized = 0;
-		pb->plan[0].task_struct->sched_class = &fair_sched_class;
+	}
+	printk(KERN_WARNING Bold Yellow "Exited: %i. Proc state: %li (?= %i)" End "\n", process_exited, cur_proc->state, TASK_DEAD);
+
+	if (process_exited && pb->root_proc == cur_proc) {
+		cur_proc->sched_class = &fair_sched_class;
 		resched_curr(rq);
 	}
-}
 
-static void yield_task_pb(struct rq *rq)
-{
-	// NOP
+	/*
+	// show all current processes (source: https://unix.stackexchange.com/questions/299140/linux-is-there-a-way-to-dump-the-task-run-queue/336663#336663)
+	{
+		struct task_struct *process, *thread;
+		int cnt = 0;
+
+		rcu_read_lock();
+		for_each_process_thread(process, thread) {
+			task_lock(thread);
+
+			// exclude system processes (which have been started earlier and thereby have a lower pid)
+			if (thread->pid > 1000) {
+				printk(KERN_WARNING "%li, %u, %s, %s\n", thread->state, thread->pid, thread->comm,
+				       thread->sched_class == &fair_sched_class ? "fair" :
+				       (thread->sched_class == &pb_sched_class ? "pb" :
+				       "other"));
+			}
+			task_unlock(thread);
+			cnt++;
+		}
+		rcu_read_unlock();
+	}
+	*/
 }
 
-static void check_preempt_curr_pb(struct rq *rq, struct task_struct *p, int flags)
+static void task_dead_pb(struct task_struct *p)
 {
-	// NOP
+	/**
+	 * We can't put the exit notification here because this only gets called _after_ a switch
+	 * from a dead process to another one.
+	 * For such a context switch it would be required that a new process is chosen, which only
+	 * happens when the plan runtime state is updated; but that update requires the information
+	 * that would be emitted here (which closes the circular dependency).
+	 * Therefore it must be done e.g. in do_exit() and not here.
+	 */
 }
 
 static struct task_struct * pick_next_task_pb(struct rq *rq,
@@ -243,6 +413,9 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	struct task_struct *picked = NULL;
 	enum pb_mode current_mode, next_mode;
 	struct pb_rq *pb = &(rq->pb);
+
+	// FIXME: Testing purposes
+	struct task_struct* one;
 	int i;
 
 	current_mode = pb->mode;
 	next_mode = determine_next_mode_pb(rq);
@@ -250,8 +423,8 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 
 	if (next_mode == PB_DISABLED_MODE && current_mode == PB_EXEC_MODE) {
 		// After Plan is done do the cleanup
-		terminate_perf_event(pb->pevent);
-		pb->pevent = NULL;
+		// FIXME: This should be done for all processes on exit and not just for the last one in the plan?!
+		//terminate_perf_event(&(get_pevent_by_pid(plan_rt_state_peek_proc()->pid)));
 		// TODO: Check if we have to free the memory or if perf takes care of it
 		// see 'perf_event_release_kernel(struct perf_event *event)' in core.c
 	}
@@ -260,7 +433,7 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	 * pb scheduler starts executing
 	 */
 	if (current_mode == PB_DISABLED_MODE && current_mode != next_mode) {
-		if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
+		if (!is_plan_finished(pb) && (plan_rt_state_peek_proc() ? (plan_rt_state_peek_proc()->state == TASK_DEAD) : true)) {
 			pb->mode = PB_DISABLED_MODE;
 			next_mode = PB_DISABLED_MODE;
 			picked = NULL;
@@ -270,47 +443,38 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	}
 
 	if (current_mode != next_mode) {
-		printk("SWITCHING MODES\n");
+		printk("SWITCHING MODES: %i -> %i\n", current_mode, next_mode);
 		pb->count_admin_cycles = 0;
 		pb->count_pb_cycles = 0;
 		// Push last non-plan task back in its corresponding runqueue
 		if (next_mode == PB_EXEC_MODE) {
 			// Necessary to manage the preempted task
 			printk("PUT OLD TASK BACK IN RQ\n");
-			put_prev_task(rq, prev);
+			put_prev_task(rq, prev); // TODO: This seems to do nothing here; check whether the call is needed at all
 		}
 	}
 
 	// EXEC Mode is next, so we return our next task to be executed
 	if (next_mode == PB_EXEC_MODE) {
-		switch(pb->triggering_syscall) {
-		case sched_trig_FORK:
-			printk(KERN_WARNING "FORK TRIGGERED THIS!!!\n");
-			break;
-		case sched_trig_EXIT:
-			printk(KERN_WARNING "EXIT TRIGGERED THIS!!!\n");
-			break;
-		default:
-			printk(KERN_WARNING "OTHER STUFF TRIGGERED THIS!!!\n");
-			break;
-		}
-
-		// reset the flag so that the relevant syscalls can be detected if they are the trigger
-		pb->triggering_syscall = sched_trig_OTHER;
-
 		// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
 		if(current_mode == PB_ADMIN_MODE) {
 			printk(KERN_DEBUG "PB ADMIN,STOP,%u,%llu\n", pb->c_entry, sched_clock());
 		} else if (current_mode == PB_DISABLED_MODE) {
 			printk("Switching from disabled to EXEC\n");
 		}
-		picked = pb->plan[pb->c_entry].task_struct;
+
+		picked = plan_rt_state_peek_proc();
 	}
 
 	return picked;
 }
 
+static void yield_task_pb(struct rq *rq)
+{
+	// NOP
+}
+
 static void put_prev_task_pb(struct rq *rq, struct task_struct *p)
 {
 	// NOP
@@ -363,7 +527,7 @@ static void update_curr_pb(struct rq *rq)
 }
 
 const struct sched_class pb_sched_class = {
-	.next			= &dl_sched_class,
+	.next			= &dl_sched_class, // place the pb scheduler directly above the deadline (dl) scheduler in priority
 	.enqueue_task		= enqueue_task_pb,
 	.dequeue_task		= dequeue_task_pb,
 	.yield_task		= yield_task_pb,
@@ -373,6 +537,7 @@ const struct sched_class pb_sched_class = {
 
 	.put_prev_task		= put_prev_task_pb, // NOP
 
+	//.task_dead		= task_dead_pb, // NOP
 	.set_curr_task		= set_curr_task_pb, // NOP
 	.task_tick		= task_tick_pb,
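For illustration (not part of the patch): the ->children / ->next_sib convention that enqueue_task_pb() walks is easy to get backwards, so here is a minimal standalone model (a stripped-down node struct; the field subset is an assumption) of the layout pbm_fork() produces by prepending the child under the fork node.

#include <stdio.h>

struct node {
	const char  *name;
	struct node *children;   /* first child */
	struct node *next_sib;   /* next sibling */
};

int main(void)
{
	/* after pbm_fork() prepends the child, the fork node looks like this: */
	struct node parent_cont = { "parent continuation", NULL, NULL };
	struct node child       = { "child branch",        NULL, &parent_cont };
	struct node fork_node   = { "fork",                &child, NULL };

	/* the traversal used in enqueue_task_pb() */
	printf("child:  %s\n", fork_node.children->name);
	printf("parent: %s\n", fork_node.children->next_sib->name);
	return 0;
}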
diff --git a/kernel/sched/perf_error_detection.c b/kernel/sched/perf_error_detection.c
index 96d3299846d9593a4c2c26db81e2f6df5a6d2508..87244e1aaa8352aa69a75935ca6ec09bc5938173 100644
--- a/kernel/sched/perf_error_detection.c
+++ b/kernel/sched/perf_error_detection.c
@@ -94,7 +94,7 @@ u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter)
 	return read_error;
 }
 
-u64 terminate_perf_event(struct perf_event *pevent)
+u64 terminate_perf_event(struct perf_event* pevent)
 {
 	u64 result;
 	unsigned long irq_flags;
@@ -104,7 +104,84 @@ u64 terminate_perf_event(struct perf_event *pevent)
 	perf_event_disable(pevent);
 	result = perf_event_release_kernel(pevent);
 	local_irq_restore(irq_flags);
-	pevent = NULL;
+	//*pevent = NULL;
 
 	return result;
 }
+
+
+/* -----------------------------------------------------------------------------
+ * PID -> pevent* Hashmap
+ * Only insertion is needed currently (since memory efficiency is not our
+ * current concern)
+ */
+
+pid_2_pevent_map pid_2_pevent;
+
+void init_pid_2_pevent_map(void)
+{
+	int i;
+	for (i = 0; i < PROC_BUF_SIZE; ++i) {
+		pid_2_pevent.index_to_pid[i] = 0;
+		pid_2_pevent.index_to_pevent[i] = NULL;
+	}
+	pid_2_pevent.last_proc_index = 0;
+}
+
+/*
+ * Returns the pevent-pointer of the process with the given PID if it
+ * exists, otherwise it returns NULL (assumption: a process with PID=0 is not
+ * considered a normal process)
+ *
+ * This is a primitive "Hashmap"-retrieval implementation (O(n))
+ */
+struct perf_event* get_pevent_by_pid(pid_t pid)
+{
+	size_t i;
+	// to get the pevent by PID one needs to find the index that corresponds to
+	// the given PID and use that to retrieve the pevent of the second array
+	// directly by index
+	size_t proc_index = 0;
+	for(i = 0; i < PROC_BUF_SIZE; i++) {
+		if (pid_2_pevent.index_to_pid[i] == pid) {
+			proc_index = i;
+			break;
+		}
+	}
+	return proc_index != 0 ? pid_2_pevent.index_to_pevent[proc_index] : NULL;
+}
+
+/*
+ * Adds a process (pid, pevent) to the pid->pevent hashmap (NON-idempotently!)
+ *
+ * Returns whether the process has been successfully inserted into the hashmap
+ */
+int add_proc_to_pevent_map(pid_t pid, struct perf_event* pevent)
+{
+	// slot 0 is reserved as the "not found" sentinel, so the last usable slot is PROC_BUF_SIZE - 1
+	if (PROC_BUF_SIZE - 1 <= pid_2_pevent.last_proc_index) {
+		printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", pid_2_pevent.last_proc_index);
+		return 0;
+	}
+	printk(KERN_WARNING "i: %lu, pid: %u\n", pid_2_pevent.last_proc_index, pid);
+	pid_2_pevent.last_proc_index++;
+	pid_2_pevent.index_to_pid[pid_2_pevent.last_proc_index] = pid;
+	pid_2_pevent.index_to_pevent[pid_2_pevent.last_proc_index] = pevent;
+	return 1;
+}
+
+int init_perf_event_into_map(struct task_struct* proc, u64 num_instr)
+{
+	int error;
+	struct perf_event* pevent;
+	pid_t pid = proc->pid;
+
+	error = init_perf_event(proc, num_instr, &pevent);
+	if (error) {
+		printk(KERN_WARNING "TASK: %u | Counting NOT started due to error\n", pid);
+		return -1;
+	}
+	add_proc_to_pevent_map(pid, pevent);
+
+	return 0;
+}
\ No newline at end of file
diff --git a/kernel/sched/perf_error_detection.h b/kernel/sched/perf_error_detection.h
index 09024038328dcd2d6df74b5fcb1d9949b28cefd0..02113e82f8afd45c8a3f4ee0280c17021c35d1b5 100644
--- a/kernel/sched/perf_error_detection.h
+++ b/kernel/sched/perf_error_detection.h
@@ -10,7 +10,7 @@ int init_perf_event(struct task_struct* proc, u64 num_instr, struct perf_event
 
 u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter);
 
-u64 terminate_perf_event(struct perf_event *pevent);
+u64 terminate_perf_event(struct perf_event* pevent);
 
 typedef struct {
 	// primitive int -> int and int -> pevent* hashmap combination
@@ -22,4 +22,9 @@ typedef struct {
 	//pthread_mutex_t lock;
 } pid_2_pevent_map;
 
+void init_pid_2_pevent_map(void);
+struct perf_event* get_pevent_by_pid(pid_t pid);
+int add_proc_to_pevent_map(pid_t pid, struct perf_event* pevent);
+int init_perf_event_into_map(struct task_struct* proc, u64 num_instr);
+
 #endif
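For illustration (not part of the patch): the slot-0-as-sentinel convention shared by get_pevent_by_pid() and add_proc_to_pevent_map(), modeled in userspace with void* standing in for struct perf_event* and an assumed PROC_BUF_SIZE.

#include <stdio.h>
#include <stddef.h>

#define PROC_BUF_SIZE 64   /* assumption, as above */

static int    index_to_pid[PROC_BUF_SIZE];     /* pid 0 marks an empty slot */
static void  *index_to_pevent[PROC_BUF_SIZE];
static size_t last_proc_index;                 /* slot 0 stays unused */

static int add(int pid, void *pevent)
{
	if (last_proc_index >= PROC_BUF_SIZE - 1)
		return 0;                      /* map is full */
	last_proc_index++;                     /* increment first: index 0 is the sentinel */
	index_to_pid[last_proc_index] = pid;
	index_to_pevent[last_proc_index] = pevent;
	return 1;
}

static void *get(int pid)
{
	size_t hit = 0;                        /* 0 == not found */
	for (size_t i = 1; i < PROC_BUF_SIZE; i++) {
		if (index_to_pid[i] == pid) {
			hit = i;
			break;
		}
	}
	return hit ? index_to_pevent[hit] : NULL;
}

int main(void)
{
	int dummy = 42;
	add(1234, &dummy);
	printf("1234 -> %p\n", get(1234));     /* the stored pointer */
	printf("9999 -> %p\n", get(9999));     /* NULL */
	return 0;
}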
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 876ef5b02926a2e64806bf1bd5f3684eec3e209a..063e47dde2510ad2e6caf4f1d72919549697ba90 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -40,6 +40,8 @@
 
 #include <linux/perf_event.h> // For performance counter
 
+#include "../behave.h"
+
 #ifdef CONFIG_SCHED_DEBUG
 #define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -544,10 +546,17 @@ struct pb_plan {
 	pid_t pid;           // process_id of the prgramm tp execute with the plan
 	uint64_t *inst_cnt;  // array of estimated instructions for each task
 	size_t num_tasks;    // number of tasks in the plan
+	pid_t ref_pid;       // PID of the plan's root/parent task used as reference (as generated by libpbm)
 };
 
 enum sched_trigger_syscall { sched_trig_FORK, sched_trig_EXIT, sched_trig_OTHER };
 
+struct syscall_info {
+	enum sched_trigger_syscall type; // which syscall triggered the scheduler
+	struct task_struct* origin;      // e.g. parent process in case of a fork
+	struct task_struct* target;      // e.g. fork: child process, exit: NULL
+};
+
 struct pb_rq
 {
 	struct plan_entry *plan; // plan (used to be proxy_task)
@@ -562,10 +571,13 @@ struct pb_rq
 
 	enum pb_mode mode; // current scheduler mode
 
+	struct task_struct* root_proc; // a plan's root/parent process
+
 	u64 total_instr; // total counted instructions for current plan
 
-	struct perf_event *pevent; // linux perf handle
-	enum sched_trigger_syscall triggering_syscall; // which syscall triggered the scheduler
+	u64 num_exited_procs; // total number of spawned/exited processes of the plan, needed to know when it is finished
+
+	struct syscall_info triggering_syscall; // which syscall triggered the scheduler
 
 	/*
 	 * flag determining whether the plan is completely initialized and should be run
@@ -901,6 +913,29 @@ static inline int cpu_of(struct rq *rq)
 
 struct task_struct *find_task_by_vpid(pid_t vnr);
 
+/**
+ * Whether the runtime plan has finished, without checking if it conforms to the forecasted plan
+ */
+static inline int is_plan_finished(struct pb_rq* pb)
+{
+	return plan_rt_state_is_empty()/* && pb->is_initialized*/;
+}
+
+/**
+ * Whether the runtime plan actually conforms to the forecasted plan
+ */
+static inline int is_plan_successful(struct pb_rq* pb)
+{
+	printk(KERN_WARNING "Plan successful? Exited processes: actual: %llu, expected: %llu\n", plan_rt_state_num_exited_procs(), pb->num_exited_procs);
+	return is_plan_finished(pb) && pb->num_exited_procs == plan_rt_state_num_exited_procs();
+}
+
+/**
+ * BEWARE:
+ * This function must not take large amounts of compute time (e.g. avoid calling printk() etc.
+ * in it) since it must not get interrupted by the scheduler ticks (e.g. task_tick_pb); that can
+ * lead to deadlocks!
+ */
 // used to determine the next mode of the PB-Scheduler
 // This function is located in sched.h since pb.c and fair.c are using this function
 static inline int determine_next_mode_pb(struct rq *rq)
@@ -947,7 +982,7 @@ static inline int determine_next_mode_pb(struct rq *rq)
 			 * tasks were pushed forward by the default scheduler and the IO
 			 * starved. We have to wait until the process is runnable.
 			 */
-			if (pb->plan[pb->c_entry].task_struct->state == 0)
+			if (plan_rt_state_peek_proc()->state == TASK_RUNNING)
 			{
 				/*
 				 * 0 == Runnable (IO succeeded)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 9f69fb6308537b2b7e6b9ba47f6cb17fd17d97ce..99636d7dbceafcb463316b6515f21ad77360da6e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -110,7 +110,7 @@ static void update_curr_stop(struct rq *rq)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 const struct sched_class stop_sched_class = {
-	.next			= &dl_sched_class,
+	.next			= &pb_sched_class,
 
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
diff --git a/pb_utils/pb_submitter/example_run.sh b/pb_utils/pb_submitter/example_run.sh
index 43de115acb584936c3078f8302fc265469b4f12d..991ef59c4c13ee0d3ec7fe4f2a183074a9c232c9 100755
--- a/pb_utils/pb_submitter/example_run.sh
+++ b/pb_utils/pb_submitter/example_run.sh
@@ -1,3 +1,6 @@
 #!/bin/sh
 cd /root
-./pb_submitter test_prog example_plan
+# FIXME: For testing purposes the first argument is simply the PID of the reference plan (which has
+# to be run before with kernel-libpbm so that it is available in the data structure)
+#./pb_submitter $1 example_plan
+./pb_submitter 666 example_plan # FIXME: THIS IS JUST FOR TESTING PURPOSES
\ No newline at end of file
diff --git a/pb_utils/test_results b/pb_utils/test_results
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
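For orientation (not part of the patch), a rough userspace sketch of the flow example_run.sh drives: record a reference run under the PBM hooks first, then submit a plan against it. The syscall numbers below are placeholders since the syscall-table wiring is not shown in this diff, and as long as the last_plan_pid testing hack is in place the PID argument of pb_set_plan is effectively ignored in favour of the last recorded run.

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/syscall.h>

/* placeholders: the real numbers come from the arch syscall table (not in this diff) */
#define SYS_pbm_set_root_proc 333
#define SYS_pb_set_plan       334

int main(int argc, char **argv)
{
	pid_t ref_pid;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <program>\n", argv[0]);
		return 1;
	}

	/* first run: record the reference behaviour model (kernel-libpbm side) */
	ref_pid = fork();
	if (ref_pid == 0) {
		syscall(SYS_pbm_set_root_proc);  /* mark this process as the PBM root */
		execlp(argv[1], argv[1], (char *)NULL);
		_exit(127);
	}
	waitpid(ref_pid, NULL, 0);

	/* second run: submit the plan; pb_submitter would fork the root process first */
	syscall(SYS_pb_set_plan, ref_pid);       /* currently ignored: last_plan_pid wins */
	return 0;
}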