From a0336fb6c1251d7105bd1706b0048aff29b123ba Mon Sep 17 00:00:00 2001
From: FKHals <5229803-FKHals@users.noreply.gitlab.com>
Date: Wed, 4 Oct 2023 12:54:12 +0200
Subject: [PATCH] Implement the plan runtime state and other things

which could not be split further into coherent incremental steps and
therefore unfortunately ended up in this one large commit.
---
 kernel/behave.c                      | 109 ++++++++++-
 kernel/behave.h                      |  46 +++++
 kernel/exit.c                        |  24 ++-
 kernel/fork.c                        |  23 ++-
 kernel/sched/pb.c                    | 280 +++++++++++++++++++++------
 kernel/sched/perf_error_detection.c  |  80 +++++++-
 kernel/sched/perf_error_detection.h  |   7 +-
 kernel/sched/sched.h                 |  41 +++-
 kernel/sched/stop_task.c             |   2 +-
 pb_utils/pb_submitter/example_run.sh |   5 +-
 pb_utils/test_results                |   0
 11 files changed, 542 insertions(+), 75 deletions(-)
 delete mode 100644 pb_utils/test_results

diff --git a/kernel/behave.c b/kernel/behave.c
index 821e231901c6..129d699b3b3c 100644
--- a/kernel/behave.c
+++ b/kernel/behave.c
@@ -31,6 +31,9 @@ SYSCALL_DEFINE0(pbm_set_root_proc) {
     return 0;
 }
 
+// FIXME: THIS IS JUST FOR TESTING PURPOSES, so that the reference PID does not have to be read and written manually in the console
+pid_t last_plan_pid;
+
 /******************************************************************************
 * Based on "libpbm" (see header file for more info)
 */
@@ -490,8 +493,8 @@ int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_node) {
         BUG_ON(current->pid != root_proc);
     }
 
-    // We have to know WHEN the exit happens relative to the parent. So every child remembers the
-    // current fork-task-node of the parent on exit (so that the join can happen at the correct
+    // We have to know WHEN the fork happens relative to the parent. So every child remembers the
+    // parent's current fork-task-node, so that on exit the merge can happen at the correct
     // position (more or less, may be imperfect due to parallelism))
     child_pbm->fork_date = fork_node;
 
@@ -600,7 +603,7 @@ void pbm_post_processing(PBM* pbm) {
     if(child_pbm) {
         printk(KERN_WARNING "Joining child thread %u\n", child_pbm->last->thread_id);
         pbm_post_processing(child_pbm);
-        pbm_join(child_pbm);
+        //pbm_join(child_pbm);
         // TODO Remove from list of childs or just mark as visited?
     }
 
@@ -608,11 +611,19 @@ void pbm_post_processing(PBM* pbm) {
     if(sib_pbm) {
         printk(KERN_WARNING "Joining sib thread %u\n", sib_pbm->last->thread_id);
         pbm_post_processing(sib_pbm);
-        pbm_join(sib_pbm);
+        //pbm_join(sib_pbm);
         // TODO Remove from list of siblings or just mark as visited?
     }
 }
 
+/* Crude recursive PBM node counter; counts the given PBM plus all of its children and siblings */
+u64 pbm_count_children(PBM* pbm) {
+    printk(KERN_WARNING "Node %lli: (pid = %u)\n", pbm_2_index(pbm), pbm->root->thread_id);
+    // count this node itself (1) and add the counts of the children and siblings, if any
+    return 1 + (pbm->children ? pbm_count_children(pbm->children) : 0)
+             + (pbm->next_sib ? pbm_count_children(pbm->next_sib) : 0);
+}
+
 /* -----------------------------------------------------------------------------
 * PBM graph output functions
 */
@@ -634,6 +645,8 @@ void pbm_join_and_print_graph_self(pid_t pid) {
     printk(KERN_WARNING "indices: %lu, %u, %u\n", index_2_pbm->last_proc_index, procs->curr_proc_index, tasks->curr_task_index);
     pbm = get_pbm_by_pid(pid);
     if (pbm) {
+        pbm->child_count = pbm_count_children(pbm);
+        printk(KERN_WARNING "Child count: %llu\n", pbm->child_count);
         pbm_post_processing(pbm);
         printk(KERN_WARNING "After post-processing:\n");
         debug_print_tasks();
@@ -643,6 +656,10 @@ void pbm_join_and_print_graph_self(pid_t pid) {
     }
     // reset so that is_relevant_process() can return early
     is_initialized = 0;
+
+    printk(KERN_WARNING "ROOT: %u\n", pid);
+    // FIXME: THIS IS JUST FOR TESTING PURPOSES
+    last_plan_pid = pid;
 }
 
 /* Crude recursive ADG printer, starts with given node */
@@ -742,4 +759,86 @@ bool is_relevant_process(struct task_struct* p) {
         proc = proc->real_parent;
     }
     return false;
-}
\ No newline at end of file
+}
+
+
+/******************************************************************************
+ * Plan runtime state
+ */
+struct plan_rt_state prs;
+
+void plan_rt_state_init(void)
+{
+    int i;
+    for (i = 0; i < PROC_BUF_SIZE; ++i) {
+        prs.node_stack[i] = NULL;
+        prs.proc_stack[i] = NULL;
+    }
+    prs.stack_size = 0;
+    prs.num_exited_procs = 0;
+}
+
+struct task_struct* plan_rt_state_peek_proc(void)
+{
+    // guard against underflow: an empty stack has no current process
+    return prs.stack_size ? prs.proc_stack[prs.stack_size - 1] : NULL;
+}
+
+void plan_rt_state_peek(pbm_NODE** node_res, struct task_struct** proc_res)
+{
+    if (0 != prs.stack_size) {
+        *node_res = prs.node_stack[prs.stack_size - 1];
+        *proc_res = plan_rt_state_peek_proc();
+    }
+}
+
+int plan_rt_state_is_empty(void)
+{
+    // it is sufficient to check only for the process since there should never be a process without
+    // a corresponding node and vice versa
+    return NULL == plan_rt_state_peek_proc();
+}
+
+void plan_rt_state_push(pbm_NODE* node, struct task_struct* proc)
+{
+    if (prs.stack_size >= PROC_BUF_SIZE) {
+        printk(KERN_WARNING "ERROR: Plan runtime stack is full!\n");
+        return;
+    }
+    prs.node_stack[prs.stack_size] = node;
+    prs.proc_stack[prs.stack_size] = proc;
+    prs.stack_size++;
+}
+
+void plan_rt_state_pop(void)
+{
+    if (prs.stack_size == 0) {
+        printk(KERN_WARNING "ERROR: Plan runtime stack is already empty!\n");
+        return;
+    }
+    prs.stack_size--;
+    prs.node_stack[prs.stack_size] = NULL;
+    prs.proc_stack[prs.stack_size] = NULL;
+}
+
+void plan_rt_state_incr_num_exited_procs(void)
+{
+    prs.num_exited_procs++;
+}
+
+u64 plan_rt_state_num_exited_procs(void)
+{
+    return prs.num_exited_procs;
+}
+
+void plan_rt_state_debug_print(void)
+{
+    size_t i;
+    pbm_NODE* node;
+    struct task_struct* task;
+    printk(KERN_WARNING "MAP\n-----\n");
+    for(i = 0; i < prs.stack_size; i++) {
+        node = prs.node_stack[i];
+        task = prs.proc_stack[i];
+        printk(KERN_WARNING "[%lu] %p, %p (%i, %s)\n", i, node, task, task ? task->pid : 0, task ? task->comm : "-");
+    }
+    printk(KERN_WARNING "-----\n");
+}
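+
+/*
+ * Minimal self-test sketch for the stack API above. Illustrative only (a
+ * documentation aid, not wired into any call path); it uses `current` as a
+ * stand-in process and a zeroed dummy node.
+ */
+#if 0
+static void plan_rt_state_selftest(void)
+{
+    pbm_NODE dummy_node = {};
+    pbm_NODE* node = NULL;
+    struct task_struct* proc = NULL;
+
+    plan_rt_state_init();
+    BUG_ON(!plan_rt_state_is_empty());
+
+    plan_rt_state_push(&dummy_node, current);
+    plan_rt_state_peek(&node, &proc);
+    BUG_ON(node != &dummy_node || proc != current);
+
+    plan_rt_state_pop();
+    BUG_ON(!plan_rt_state_is_empty());
+}
+#endif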
diff --git a/kernel/behave.h b/kernel/behave.h
index ec0439290ddd..8348874054c5 100644
--- a/kernel/behave.h
+++ b/kernel/behave.h
@@ -1,8 +1,12 @@
 #ifndef PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H
 #define PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H
 
+#include <linux/types.h>
 #include "sched/perf_error_detection.h"
 
+// FIXME: THIS IS JUST FOR TESTING PURPOSES, so that the reference PID does not have to be read and written manually in the console
+extern pid_t last_plan_pid;
+
 /******************************************************************************
 * Based on "libpbm":
 *  Program Behaviour Model (PBM) as a Task Precedence Graph (TPG),
@@ -70,6 +74,9 @@ typedef struct _PBM
     struct _PBM* children; // first child (in a list of forked children)
     struct _PBM* next_sib; // next sibling
 
+    // number of children (including children's children) that it spawned in total
+    uint64_t child_count;
+
     /*
     * Performance measurement and recording
     */
@@ -174,4 +181,43 @@ bool is_relevant_process(struct task_struct *p);
 int start_counting(struct task_struct *p);
 int stop_counting(void);
 
+/******************************************************************************/
+
+PBM* get_pbm_by_pid(pid_t pid);
+
+/******************************************************************************
+ * Plan runtime state
+ *
+ * Represents the current state of the plan at runtime.
+ * The head of the stack is always the currently executing process together
+ * with its corresponding plan node.
+ *
+ * Implementation:
+ * Two parallel stacks store the plan nodes and their corresponding runtime
+ * processes as they accumulate through forking; entries are executed from top
+ * to bottom (the current process is always the head).
+ * This works because we assume that every fork in the plan corresponds to a
+ * runtime fork. Hence a fork node pushed onto the stack as the starting point
+ * for the traversal after the next exit node always has a corresponding
+ * runtime process, which is the one to run once the previous (child) process
+ * exits.
+ */
+struct plan_rt_state {
+	pbm_NODE* node_stack[PROC_BUF_SIZE];
+	struct task_struct* proc_stack[PROC_BUF_SIZE];
+	size_t stack_size;
+
+	// number of processes of the plan that have exited so far; needed to determine whether the
+	// plan has finished
+	u64 num_exited_procs;
+};
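+
+/*
+ * Worked trace of the invariant described above (illustrative sketch; node
+ * names are made up). For a plan  root --fork--> {child, parent'}  run by
+ * root process P forking child C, the stack evolves as follows:
+ *
+ *   pb_set_plan():  [ (root, P) ]
+ *   P forks C:      [ (parent', P), (child, C) ]   child on top, runs first
+ *   C exits:        [ (parent', P) ]               parent resumes
+ *   P exits:        [ ]                            plan finished
+ */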
+
+void plan_rt_state_init(void);
+struct task_struct* plan_rt_state_peek_proc(void);
+void plan_rt_state_peek(pbm_NODE** node_res, struct task_struct** proc_res);
+int plan_rt_state_is_empty(void);
+void plan_rt_state_push(pbm_NODE* node, struct task_struct* proc);
+void plan_rt_state_pop(void);
+void plan_rt_state_debug_print(void);
+void plan_rt_state_incr_num_exited_procs(void);
+u64 plan_rt_state_num_exited_procs(void);
+
 #endif //PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H
diff --git a/kernel/exit.c b/kernel/exit.c
index de4261f61cca..08b0df2566ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -771,10 +771,29 @@ void __noreturn do_exit(long code)
 	struct rq* rq;
 
 	rq = this_rq();
+
+	// FIXME: This disclaimer is not really true anymore (the code does not follow it and works either way)
+	/**
+	 * BEWARE:
+	 * We must not update the plan runtime model too early: the exiting process has to have
+	 * signaled its exit to its parent first, otherwise the scheduler could switch to the parent
+	 * too early while it is still waiting for the child to exit, which leads to a deadlock!
+	 * But we also must not update it too late, since then the model would not be up to date
+	 * when the next task is picked, so the child would remain the current process of the model,
+	 * which also leads to a deadlock because the child would never be replaced by the runnable
+	 * parent.
+	 * We want to update the model exactly after the child's exit signal has been sent to the
+	 * parent.
+	 */
 	// prevent syscalls from outside of the measured program (e.g. admin tasks) to be recognized
 	if (PB_EXEC_MODE == rq->pb.mode) {
-		// set flag so that the pb-scheduler knows which syscall triggered the scheduling
-		rq->pb.triggering_syscall = sched_trig_EXIT;
+		printk(KERN_EMERG "EXIT: %u, CMD: '%s', PARENT-CMD: '%s'\n", tsk->pid, tsk->comm, tsk->real_parent->comm);
+		// inform the pb-scheduler which syscall triggered the scheduling
+		rq->pb.triggering_syscall.type = sched_trig_EXIT;
+		rq->pb.triggering_syscall.origin = current;
+
+		// inform the plan runtime state
+		plan_rt_state_incr_num_exited_procs();
 	}
 
 	// call the readout before the process is terminated
@@ -913,6 +932,7 @@ void __noreturn do_exit(long code)
 	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
 	TASKS_RCU(preempt_enable());
 	exit_notify(tsk, group_dead);
+
 	proc_exit_connector(tsk);
 	mpol_put_task_policy(tsk);
 #ifdef CONFIG_FUTEX
diff --git a/kernel/fork.c b/kernel/fork.c
index 812c9cbc1c41..f2d26cbec452 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2024,10 +2024,13 @@ long _do_fork(unsigned long clone_flags,
 	fork_date = NULL;
 
 	rq = this_rq();
+
 	// prevent syscalls from outside of the measured program (e.g. admin tasks) to be recognized
 	if (PB_EXEC_MODE == rq->pb.mode) {
-		// set flag so that the pb-scheduler knows which syscall triggered the scheduling
-		rq->pb.triggering_syscall = sched_trig_FORK;
+		printk(KERN_EMERG "DO FORK CALLED by: '%s' %u\n", current->comm ,parent_pid);
+		// inform the pb-scheduler knows which syscall triggered the scheduling
+		rq->pb.triggering_syscall.type = sched_trig_FORK;
+		rq->pb.triggering_syscall.origin = current;
 	}
 
 	// FIXME: This will not get called for mpirun since then bash will be the parent here
@@ -2064,6 +2067,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!IS_ERR(p)) {
 		struct completion vfork;
 		struct pid *pid;
+		int error;
 
 		trace_sched_process_fork(current, p);
 
@@ -2088,6 +2092,21 @@ long _do_fork(unsigned long clone_flags,
 					p->pid, p->comm, p->real_parent->real_parent->comm);
 			pbm_fork(p, parent_pid, fork_date);
 		}
+		/**
+		 * BEWARE:
+		 * This info update must happen before the process wakes up, since the information is used
+		 * in the task enqueue routine and missing it there would lead to NULL-pointer errors etc.
+		 */
+		if (PB_EXEC_MODE == rq->pb.mode) {
+			// inform the pb-scheduler which child process has been spawned
+			rq->pb.triggering_syscall.target = p;
+			// start observing the child process
+			error = init_perf_event_into_map(p, 0);
+			if (error) {
+				printk(KERN_WARNING "TASK: %u | Counting NOT started due to error\n", p->pid);
+				return -1;
+			}
+		}
 
 		wake_up_new_task(p);
 
diff --git a/kernel/sched/pb.c b/kernel/sched/pb.c
index 39748bde8531..fb196f10b390 100644
--- a/kernel/sched/pb.c
+++ b/kernel/sched/pb.c
@@ -5,11 +5,31 @@
 #include <linux/spinlock.h>
 #include <linux/perf_event.h>
 #include <linux/kthread.h>
+#include <linux/list.h>
 #include "sched.h"
 
 typedef struct pb_plan pb_plan_t;
 
+// terminal colors (source: https://pkg.go.dev/github.com/whitedevops/colors)
+#define Red "\033[91m"
+#define Yellow "\033[33m"
+#define Cyan "\033[36m"
+#define Bold "\033[1m"
+#define End "\033[0m"
+
+
+static void reset_triggering_syscall_info(void) {
+	struct pb_rq *pb = &(this_rq()->pb);
+	pb->triggering_syscall.type = sched_trig_OTHER;
+	pb->triggering_syscall.origin = NULL;
+	pb->triggering_syscall.target = NULL;
+}
+
+
+/* -------------------------------------------------------------------------- */
+
 SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
+	pb_plan_t _plan;
 	struct task_struct* task;
 	struct rq* rq;
 	struct pb_rq* pb_rq;
@@ -19,6 +39,8 @@ SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
 	unsigned long copied;
 	unsigned int i;
 	int res;
+
+	PBM* pbm;
 	
 	copied = copy_from_user(&_plan, plan, sizeof(pb_plan_t));
 	
@@ -47,12 +69,34 @@ SYSCALL_DEFINE1(pb_set_plan, pid_t, reference_proc_pid) {
 		return -1;
 	}
 
+	// FIXME: THIS IS JUST FOR TESTING PURPOSES; it would be better to pass the wanted PID as an
+	// argument (_plan.ref_pid) instead of using last_plan_pid
+	pbm = get_pbm_by_pid(last_plan_pid);
+	if (!pbm) {
+		printk(KERN_WARNING "No PBM found for PID %u\n", last_plan_pid);
+		return -1;
+	}
+
+	printk(KERN_WARNING "Init Plan RunTime state\n");
+	// reset plan runtime state
+	plan_rt_state_init();
+
+	// prepare the plan runtime stack by pushing the root node of the reference model/plan
+	plan_rt_state_push(pbm->root, task);
+
 	rq = this_rq();
 
+	// prevent the pb-scheduler from being informed of the fork of the root process since we want
+	// it to continue running instead of being switched out
+	reset_triggering_syscall_info();
+
+	// Only the root process has its scheduling class set manually here, since it has already been
+	// initialized to use the fair scheduler by the fork() syscall in pb_submitter. That fork must
+	// happen before the plan is initialized, because the PID of the root process is needed to
+	// start the performance counting as early as possible.
+	// BEWARE: This also means that the scheduling class of the root process needs to be reset to
+	// the fair scheduler on exit so that it can do the necessary cleanup in its data structures.
 	task->sched_class = &pb_sched_class;
 
 	pb_rq = &rq->pb;
 
+	pb_rq->root_proc = task;
+	pb_rq->num_exited_procs = pbm->child_count;
+
 	set_pb_plan_size(pb_rq, _plan.num_tasks);
 
 	for (i = 0; i < _plan.num_tasks; i++ ) {
@@ -86,15 +130,15 @@ int pb_submit_plan(struct rq *rq)
 	 * Must be volatile to ensure correct initialization order
 	 */
 	volatile struct pb_rq * pb = (volatile struct pb_rq*)(&(rq->pb));
-	int perf_init_res;
+	int error;
 	int i = 0;
 
 	if (pb->mode != PB_DISABLED_MODE) {
 		return -1;
 	}
 
-	perf_init_res = init_perf_event(pb->plan[i].task_struct, pb->plan[i].n_instr, &(pb->pevent));
-	if(perf_init_res < 0) {
+	error = init_perf_event_into_map(pb->plan[i].task_struct, pb->plan[i].n_instr);
+	if(error) {
 		//initialization error detection/handling could happen here
 		printk(KERN_WARNING "PB INIT,%u: FAILED OPEN PERF EVENT\n", i);
 	} else {
@@ -146,21 +190,88 @@ void init_pb_rq(struct pb_rq *pb_rq)
 	pb_rq->mode = PB_DISABLED_MODE;
 	pb_rq->c_entry = 0;
 	pb_rq->size = 0;
-	pb_rq->pevent = NULL;
 	pb_rq->is_initialized = 0;
 	pb_rq->waiting_on_io = 0;
+
+	init_pid_2_pevent_map();
 }
 EXPORT_SYMBOL(init_pb_rq);
 
 // IO has finished, we can schedule the next task
+/**
+ * This is called in activate_task() in wake_up_new_task() which gets called during _do_fork() 
+ * while switching from parent- to child-context. __schedule() may be called after it depending on
+ * various conditions (see __schedule in kernel/sched/core.c) but therefore it is always called 
+ * before dequeue_task_pb() and pick_next_task_pb().
+ */
 static void enqueue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct pb_rq *pb = &(rq->pb);
 
 	pb->waiting_on_io = 0;
+
+	if (rq->pb.mode == PB_EXEC_MODE) {
+		printk(KERN_WARNING "ENQEUE TASK %u\n", p->pid);
+		if (pb->triggering_syscall.type == sched_trig_FORK) {
+			pbm_NODE* cur_node;
+			pbm_NODE* fork_node;
+			struct task_struct* cur_proc;
+			int fork_node_type;
+
+			// save the current process for later use since the plan_rt_state might get modified
+			plan_rt_state_peek(&cur_node, &cur_proc);
+			fork_node = cur_node->children;
+			fork_node_type = fork_node->type;
+
+			/**
+			 * the scheduling class (pb) of the forked child is set in kernel/sched/core.c:sched_fork()
+			 */
+
+			// if a fork occurred then the next node should be a fork node
+			if (FORK != fork_node_type) {
+				printk(KERN_WARNING "ERROR: Fork node expected but got: %i\n", fork_node_type);
+				//TODO: Delegate to higher instance
+			}
+
+			/**
+			 * since we prepend the child node in pbm_fork() (see behave.c) the child of a
+			 * fork-node is the child node (->children) while the parent is the next sibling
+			 * (->next_sib).
+			 */
+
+			// update the parent node: Keep the process and replace the node before the fork
+			// with the _parent_ node after it
+			// Precondition: The plan_rt_state is not empty (since pb_set_plan() initialized it)
+			plan_rt_state_pop();
+			plan_rt_state_push(fork_node->children->next_sib, pb->triggering_syscall.origin);
+
+			// add the child
+			plan_rt_state_push(fork_node->children, pb->triggering_syscall.target);
+			plan_rt_state_debug_print();
+
+			// reset the info so that the next relevant triggering syscall can be detected again
+			reset_triggering_syscall_info();
+
+			// force rescheduling so that the child can be picked next
+			resched_curr(rq); //FIXME TESTING!!!
+		}
+	}
 }
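+
+/*
+ * Sketch of the node layout the fork handling above relies on (derived from
+ * pbm_fork() prepending the child; names are illustrative):
+ *
+ *   cur_node -> fork_node
+ *                 fork_node->children           = child node
+ *                 fork_node->children->next_sib = parent node after the fork
+ *
+ * Hence: pop the pre-fork entry, push (parent node, origin), then push
+ * (child node, target), so the child ends up on top and is picked next.
+ */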
 
-// task started IO and thus it is finished
+/**
+ * This is called right AFTER enqueue_task() in wake_up_new_task().
+ */
+static void check_preempt_curr_pb(struct rq *rq, struct task_struct *p, int flags)
+{
+	// NOP
+}
+
+/**
+ * task started IO and thus it is finished
+ *
+ * This is called in __schedule() by deactivate_task() BEFORE pick_next_task() which means the
+ * state of the plan (e.g. in case of an exit) is not yet updated!
+ */
 static void dequeue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct pb_rq *pb = &(rq->pb);
@@ -168,72 +279,130 @@ static void dequeue_task_pb(struct rq *rq, struct task_struct *p, int flags)
 	u64 perf_counter;
 	u64 counter_diff;
 	u64 read_error;
-	bool premature_finish = false;
+	u64 expected_instr_count;
+	u64 diff_from_expected;
+	pbm_NODE* cur_node = NULL;
+	struct task_struct* cur_proc = NULL;
+	struct perf_event* pevent;
 
+	bool process_exited = false;
 
-	if (pb->waiting_on_io) {
+	//printk("Dequeue task: %u\n", p->pid);
+
+	if (pb->waiting_on_io && p->state != TASK_DEAD) {
+		printk("Waiting for IO\n");
 		return;
 	}
 	pb->waiting_on_io = 1;
 	c_entry_curr = pb->c_entry;
+
+	// save the current process for later use since the plan_rt_state might get modified
+	plan_rt_state_peek(&cur_node, &cur_proc);
+	pevent = get_pevent_by_pid(cur_proc->pid);
 	
-	if(!pb->pevent) {
+	if(!pevent) {
 		printk("WARNING: PERF EVENT IS NULL");
 	}
 
+	if (pb->triggering_syscall.type == sched_trig_EXIT) {
+		// remove the exited process from the stack and run the next available
+		plan_rt_state_pop();
+		process_exited = true;
+		//terminate_perf_event(get_pevent_by_pid(prev_proc->pid));
+		plan_rt_state_debug_print();
+
+		// reset the info so that the next relevant triggering syscall can be detected again
+		reset_triggering_syscall_info();
+	} else {
+		if (!plan_rt_state_is_empty()) {
+			plan_rt_state_debug_print();
+		}
+	}
+
+	if (cur_node) {
+		expected_instr_count = cur_node->count;
+	} else {
+		expected_instr_count = 0;
+	}
+
 	// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
-	read_error = get_perf_counter(pb->pevent, &perf_counter);
+	read_error = get_perf_counter(pevent, &perf_counter);
 	if (read_error) {
 		printk(KERN_WARNING "FETCHING PERFORMANCE COUNTER IN PB SCHEDULER FAILED WITH %llu\n", read_error);
 	}
-	counter_diff = perf_counter - pb->total_instr;
+	counter_diff = perf_counter; //- pb->total_instr;
 	pb->plan[c_entry_curr].n_instr_counted = counter_diff;
 	pb->total_instr = perf_counter;
-	if (counter_diff < pb->plan[c_entry_curr].n_instr) {
-		u64 under_time = pb->plan[c_entry_curr].n_instr - counter_diff;
-
-		printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO SHORT\n", pb->plan[pb->c_entry].task_id, under_time);
-	} else if (counter_diff > pb->plan[c_entry_curr].n_instr) {
-		//TODO: Check if actually an overflow occurs and an another calculation is necessary
-		// (setting a flag in the perf overflow_handler could be a solution)
-		u64 over_time = counter_diff - pb->plan[c_entry_curr].n_instr;
-
-		printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO LONG\n", pb->plan[pb->c_entry].task_id, over_time);
+	// compute |counter_diff - expected_instr_count| without u64 underflow
+	diff_from_expected = counter_diff > expected_instr_count ?
+			counter_diff - expected_instr_count : expected_instr_count - counter_diff;
+	//TODO: Set proper threshold for significance (relative values would probably be better than absolutes)
+	//if (diff_from_expected > 0)
+	{
+		printk(KERN_WARNING Yellow Bold "PB TASK %u RAN %llu / %llu INSTRUCTIONS TOO %s" End "\n",
+				cur_proc->pid, diff_from_expected, expected_instr_count,
+				counter_diff < expected_instr_count ? "SHORT" : "LONG");
+	}
 	}
 
 	pb->c_entry++;
 
-	/**
-		* Don't schedule a task that is dead. (e.g. plan was incorrect and program finished quicker)
-		* TODO: if we have multiple tasks structs try the next plan entry
-		*/
-	if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
-		premature_finish = true;
+	//TODO: Can this actually happen? Can a process die without calling exit?
+	// remove a dead process which has not called exit from the plan
+	if (!process_exited && cur_proc->state == TASK_DEAD) {
+		plan_rt_state_pop();
 	}
 
-	if (pb->c_entry >= pb->size || premature_finish) {
-		if (premature_finish) {
+	if (is_plan_finished(pb)) {
+		if (!is_plan_successful(pb)) {
 			printk(KERN_WARNING "PLAN TERMINATED PREMATURELY \n");
 		}
 		else {
-			printk(KERN_WARNING "PLAN DONE \n");
+			printk(KERN_WARNING Bold Yellow "PLAN DONE" End "\n");
 		}
 
 		// set back to cfs for completion of task
 		pb->is_initialized = 0;
-		pb->plan[0].task_struct->sched_class = &fair_sched_class;
+	}
+	printk(KERN_WARNING Bold Yellow "Exited: %i. Proc state: %li (?= TASK_DEAD: %i)" End "\n", process_exited, cur_proc->state, TASK_DEAD);
+
+	if (process_exited && pb->root_proc == cur_proc) {
+		cur_proc->sched_class = &fair_sched_class;
 		resched_curr(rq);
 	}
-}
 
-static void yield_task_pb(struct rq *rq)
-{
-	// NOP
+	/*
+	// show all current processes (source: https://unix.stackexchange.com/questions/299140/linux-is-there-a-way-to-dump-the-task-run-queue/336663#336663)
+	{
+		struct task_struct *process, *thread;
+		int cnt = 0;
+
+		rcu_read_lock();
+		for_each_process_thread(process, thread) {
+			task_lock(thread);
+
+			// exclude system processes (which have been started earlier and thereby have a lower pid)
+			if (thread->pid > 1000) {
+				printk(KERN_WARNING "%li, %u, %s, %s\n", thread->state, thread->pid, thread->comm,
+						thread->sched_class == &fair_sched_class ? "fair" :
+							(thread->sched_class == &pb_sched_class ? "pb" :
+								"other"));
+			}
+			task_unlock(thread);
+			cnt++;
+		}
+		rcu_read_unlock();
+	}
+	*/
 }
 
-static void check_preempt_curr_pb(struct rq *rq, struct task_struct *p, int flags)
+static void task_dead_pb(struct task_struct *p)
 {
-	// NOP
+	/**
+	 * We can't put the exit notification here because this only gets called _after_ a switch from
+	 * a dead process to another one.
+	 * Such a context switch requires that a new process has been chosen, which only happens once
+	 * the plan runtime state is updated; but that update requires the information that would be
+	 * emitted here (which closes the circular dependency).
+	 * Therefore it must be done e.g. in do_exit() and not here.
+	 */
 }
 
 static struct task_struct * pick_next_task_pb(struct rq *rq,
@@ -243,6 +412,9 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	struct task_struct *picked = NULL;
 	enum pb_mode current_mode, next_mode;
 	struct pb_rq *pb = &(rq->pb);
+
+	// FIXME: Testing purposes
+	struct task_struct* one; int i;
 	
 	current_mode = pb->mode;
 	next_mode = determine_next_mode_pb(rq);
@@ -250,8 +422,8 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 
 	if (next_mode == PB_DISABLED_MODE && current_mode == PB_EXEC_MODE) {
 		// After Plan is done do the cleanup
-		terminate_perf_event(pb->pevent);
-		pb->pevent = NULL;
+		// FIXME: This should be done for all processes on exit and not just for the last one in the plan?!
+		//terminate_perf_event(&(get_pevent_by_pid(plan_rt_state_peek_proc()->pid)));
 		// TODO: Check if we have to free the memory or if perf takes care of it
 		// see 'perf_event_release_kernel(struct perf_event *event)' in core.c
 	}
@@ -260,7 +432,7 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	 * pb scheduler starts executing
 	 */
 	if (current_mode == PB_DISABLED_MODE && current_mode != next_mode) {
-		if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
+		if (!is_plan_finished(pb) && (plan_rt_state_peek_proc() ? (plan_rt_state_peek_proc()->state == TASK_DEAD) : true)) {
 			pb->mode = PB_DISABLED_MODE;
 			next_mode = PB_DISABLED_MODE;
 			picked = NULL;
@@ -270,47 +442,38 @@ static struct task_struct * pick_next_task_pb(struct rq *rq,
 	}
 
 	if (current_mode != next_mode) {
-		printk("SWITCHING MODES\n");
+		printk("SWITCHING MODES: %i -> %i\n", current_mode, next_mode);
 		pb->count_admin_cycles = 0;
 		pb->count_pb_cycles = 0;
 		// Push last non-plan task back in its corresponding runqueue
 		if (next_mode == PB_EXEC_MODE) {
 			// Necessary to manage the preempted task
 			printk("PUT OLD TASK BACK IN RQ\n");
-			put_prev_task(rq, prev);
+			put_prev_task(rq, prev);  // TODO: This does nothing, right? So why even call?
 		}
 	}
 
 	// EXEC Mode is next, so we return our next task to be executed
 	if (next_mode == PB_EXEC_MODE) {
 
-		switch(pb->triggering_syscall) {
-			case sched_trig_FORK:
-				printk(KERN_WARNING "FORK TRIGGERED THIS!!!\n");
-				break;
-			case sched_trig_EXIT:
-				printk(KERN_WARNING "EXIT TRIGGERED THIS!!!\n");
-				break;
-			default:
-				printk(KERN_WARNING "OTHER STUFF TRIGGERED THIS!!!\n");
-				break;
-		}
-
-		// reset the flag so that the relevant syscalls can be detected if they are the trigger
-		pb->triggering_syscall = sched_trig_OTHER;
-
 		// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
 		if(current_mode == PB_ADMIN_MODE) {
 			printk(KERN_DEBUG "PB ADMIN,STOP,%u,%llu\n", pb->c_entry, sched_clock());
 		} else if (current_mode == PB_DISABLED_MODE) {
 			printk("Switching from disabled to EXEC\n");
 		}
-		picked = pb->plan[pb->c_entry].task_struct;
+
+		picked = plan_rt_state_peek_proc();
 	}
 
 	return picked;
 }
 
+static void yield_task_pb(struct rq *rq)
+{
+	// NOP
+}
+
 static void put_prev_task_pb(struct rq *rq, struct task_struct *p)
 {
 	// NOP
@@ -363,7 +526,7 @@ static void update_curr_pb(struct rq *rq)
 }
 
 const struct sched_class pb_sched_class = {
-	.next			= &dl_sched_class,
+	.next			= &dl_sched_class, // make the pb scheduler higher priority than the deadline (dl) scheduler
 	.enqueue_task		= enqueue_task_pb,
 	.dequeue_task		= dequeue_task_pb,
 	.yield_task		= yield_task_pb,
@@ -373,6 +536,7 @@ const struct sched_class pb_sched_class = {
 	.pick_next_task		= pick_next_task_pb,
 	.put_prev_task		= put_prev_task_pb, // NOP
 
+	//.task_dead		= task_dead_pb, // NOP
 	.set_curr_task          = set_curr_task_pb, // NOP
 	.task_tick		= task_tick_pb,
 
diff --git a/kernel/sched/perf_error_detection.c b/kernel/sched/perf_error_detection.c
index 96d3299846d9..87244e1aaa83 100644
--- a/kernel/sched/perf_error_detection.c
+++ b/kernel/sched/perf_error_detection.c
@@ -94,7 +94,7 @@ u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter)
 	return read_error;
 }
 
-u64 terminate_perf_event(struct perf_event *pevent)
+u64 terminate_perf_event(struct perf_event* pevent)
 {
 	u64 result;
 	unsigned long irq_flags;
@@ -104,7 +104,83 @@ u64 terminate_perf_event(struct perf_event *pevent)
 	perf_event_disable(pevent);
 	result = perf_event_release_kernel(pevent);
 	local_irq_restore(irq_flags);
-    pevent = NULL;
+    //*pevent = NULL;
 
 	return result;
 }
+
+
+/* -----------------------------------------------------------------------------
+ * PID -> pevent* Hashmap
+ * Only insertion is needed currently (since memory efficiency is not our
+ * current concern)
+ */
+
+pid_2_pevent_map pid_2_pevent;
+
+void init_pid_2_pevent_map(void)
+{
+    int i;
+    for (i = 0; i < PROC_BUF_SIZE; ++i) {
+        pid_2_pevent.index_to_pid[i] = 0;
+        pid_2_pevent.index_to_pevent[i] = NULL;
+    }
+    pid_2_pevent.last_proc_index = 0;
+}
+
+/*
+ * Returns the pevent-pointer of the corresponding given process by PID if it
+ * exists otherwise it returns NULL (assumption: process with PID=0 is not
+ * considered a normal process)
+ *
+ * This is a primitive "Hashmap"-retrieval implementation (O(n))
+ */
+struct perf_event* get_pevent_by_pid(pid_t pid)
+{
+    size_t i;
+    // to get the pevent by PID one needs to find the index that corresponds to
+    // the given PID and use that to retrieve the pevent of the second array
+    // directly by index
+    size_t proc_index = 0;
+    for(i = 0; i < PROC_BUF_SIZE; i++) {
+        if (pid_2_pevent.index_to_pid[i] == pid) {
+            proc_index = i;
+            break;
+        }
+    }
+    return proc_index != 0 ? pid_2_pevent.index_to_pevent[proc_index] : NULL;
+}
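+
+/*
+ * Note: slot 0 is never written by add_proc_to_pevent_map() below (it
+ * increments last_proc_index before storing), which is what lets
+ * proc_index == 0 act as the "not found" sentinel above. Illustrative
+ * lookup (sketch only):
+ *
+ *     struct perf_event* ev = get_pevent_by_pid(task->pid);
+ *     if (!ev)
+ *         printk(KERN_WARNING "no pevent for pid %u\n", task->pid);
+ */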
+
+/*
+ * Adds a process (pid, pevent) to the pid->pevent hashmap (NON-idempotently!)
+ *
+ * Returns whether the process has been successfully inserted into the hashmap
+ */
+int add_proc_to_pevent_map(pid_t pid, struct perf_event* pevent)
+{
+    // the index is pre-incremented below, so reject once the last usable slot (PROC_BUF_SIZE - 1) is taken
+    if (pid_2_pevent.last_proc_index >= PROC_BUF_SIZE - 1) {
+        printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", pid_2_pevent.last_proc_index);
+        return 0;
+    }
+    printk(KERN_WARNING "i: %lu, pid: %u\n", pid_2_pevent.last_proc_index, pid);
+    pid_2_pevent.last_proc_index++;
+    pid_2_pevent.index_to_pid[pid_2_pevent.last_proc_index] = pid;
+    pid_2_pevent.index_to_pevent[pid_2_pevent.last_proc_index] = pevent;
+    return 1;
+}
+
+int init_perf_event_into_map(struct task_struct* proc, u64 num_instr)
+{
+    int error;
+    struct perf_event* pevent;
+    pid_t pid = proc->pid;
+
+    error = init_perf_event(proc, num_instr, &pevent);
+    if (error) {
+        printk(KERN_WARNING "TASK: %u | Counting NOT started due to error\n", pid);
+        return -1;
+    }
+    add_proc_to_pevent_map(pid, pevent);
+
+    return 0;
+}
diff --git a/kernel/sched/perf_error_detection.h b/kernel/sched/perf_error_detection.h
index 09024038328d..02113e82f8af 100644
--- a/kernel/sched/perf_error_detection.h
+++ b/kernel/sched/perf_error_detection.h
@@ -10,7 +10,7 @@ int init_perf_event(struct task_struct* proc, u64 num_instr, struct perf_event *
 
 u64 get_perf_counter(struct perf_event *pevent, u64 *perf_counter);
 
-u64 terminate_perf_event(struct perf_event *pevent);
+u64 terminate_perf_event(struct perf_event* pevent);
 
 typedef struct {
     // primitive int -> int and int -> pevent* hashmap combination
@@ -22,4 +22,9 @@ typedef struct {
     //pthread_mutex_t lock;
 } pid_2_pevent_map;
 
+void init_pid_2_pevent_map(void);
+struct perf_event* get_pevent_by_pid(pid_t pid);
+int add_proc_to_pevent_map(pid_t pid, struct perf_event* pevent);
+int init_perf_event_into_map(struct task_struct* proc, u64 num_instr);
+
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 876ef5b02926..063e47dde251 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -40,6 +40,8 @@
 
 #include <linux/perf_event.h> // For performance counter
 
+#include "../behave.h"
+
 #ifdef CONFIG_SCHED_DEBUG
 #define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
 #else
@@ -544,10 +546,17 @@ struct pb_plan {
  	    pid_t pid;			 // process_id of the prgramm tp execute with the plan
         uint64_t *inst_cnt;  // array of estimated instructions for each task
         size_t num_tasks;	 // number of tasks in the plan
+		pid_t ref_pid;       // pid of the root parent task of the plan used as reference (as generated by libpbm)
 };
 
 enum sched_trigger_syscall { sched_trig_FORK, sched_trig_EXIT, sched_trig_OTHER };
 
+struct syscall_info {
+	enum sched_trigger_syscall type; // which syscall triggered the scheduler
+	struct task_struct* origin;  // e.g. parent process in case of a fork
+	struct task_struct* target;  // e.g. fork: child process, exit: NULL
+};
+
 struct pb_rq
 {
 	struct plan_entry *plan; // plan (used to be proxy_task)
@@ -562,10 +571,13 @@ struct pb_rq
 
 	enum pb_mode mode;		// current scheduler mode
 
+	struct task_struct* root_proc; // a plan's root/parent process
+
 	u64 total_instr;		// total counted instructions for current plan
 
-	struct perf_event *pevent; // linux perf handle
-	enum sched_trigger_syscall triggering_syscall; // which syscall triggered the scheduler
+	u64 num_exited_procs; // total number of processes the plan is expected to spawn/exit; used to know when it is finished
+
+	struct syscall_info triggering_syscall; // which syscall triggered the scheduler
 
 	/*
 	 * flag determining whether the plan is completely initialized and should be run
@@ -901,6 +913,29 @@ static inline int cpu_of(struct rq *rq)
 
 struct task_struct *find_task_by_vpid(pid_t vnr);
 
+/**
+ * Whether the runtime plan has finished, without checking that it conforms to the forecasted plan
+ */
+static inline int is_plan_finished(struct pb_rq* pb)
+{
+	return plan_rt_state_is_empty()/* && pb->is_initialized*/;
+}
+
+/**
+ * Whether the runtime plan actually conforms to the forecasted plan
+ */
+static inline int is_plan_successful(struct pb_rq* pb)
+{
+	printk(KERN_WARNING "Plan successful? Exited processes: actual: %llu, expected: %llu\n", plan_rt_state_num_exited_procs(), pb->num_exited_procs);
+	return is_plan_finished(pb) && pb->num_exited_procs == plan_rt_state_num_exited_procs();
+}
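+
+/**
+ * Intended check order, as used by dequeue_task_pb() in pb.c (sketch only):
+ *
+ *     if (is_plan_finished(pb)) {
+ *         if (!is_plan_successful(pb))
+ *             printk(KERN_WARNING "PLAN TERMINATED PREMATURELY\n");
+ *         pb->is_initialized = 0; // plan no longer active
+ *     }
+ */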
+
+/**
+ * BEWARE:
+ * This function must NOT take large amounts of compute time (e.g. by calling printk() etc. in it)
+ * since it must not get interrupted by the scheduler ticks (e.g. task_tick_pb), as that can lead
+ * to deadlocks!
+ */
 // used to determine the next mode of the PB-Scheduler
 // This function is located in sched.h since pb.c and fair.c are using this function
 static inline int determine_next_mode_pb(struct rq *rq)
@@ -947,7 +982,7 @@ static inline int determine_next_mode_pb(struct rq *rq)
 					 * tasks were pushed forward by the default scheduler and the IO
 					 * starved. We have to wait until the process is runnable.
 					 */
-					if (pb->plan[pb->c_entry].task_struct->state == 0)
+					if (plan_rt_state_peek_proc()->state == TASK_RUNNING)
 					{
 						/*
 						 * 0 	== Runnable (IO succeeded)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 9f69fb630853..99636d7dbcea 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -110,7 +110,7 @@ static void update_curr_stop(struct rq *rq)
  * Simple, special scheduling class for the per-CPU stop tasks:
  */
 const struct sched_class stop_sched_class = {
-	.next			= &dl_sched_class,
+	.next			= &pb_sched_class,
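+	// resulting pick order, assuming pb_sched_class.next == &dl_sched_class (see pb.c):
+	// stop -> pb -> dl -> rt -> fair -> idle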
 
 	.enqueue_task		= enqueue_task_stop,
 	.dequeue_task		= dequeue_task_stop,
diff --git a/pb_utils/pb_submitter/example_run.sh b/pb_utils/pb_submitter/example_run.sh
index 43de115acb58..991ef59c4c13 100755
--- a/pb_utils/pb_submitter/example_run.sh
+++ b/pb_utils/pb_submitter/example_run.sh
@@ -1,3 +1,6 @@
 #!/bin/sh
 cd /root
-./pb_submitter test_prog example_plan
+# FIXME: For testing purposes the first argument is simply the PID of the reference plan (which has
+# to have been run beforehand with kernel-libpbm so that it is available in the data structure)
+#./pb_submitter $1 example_plan
+./pb_submitter 666 example_plan # FIXME: THIS IS JUST FOR TESTING PURPOSES
diff --git a/pb_utils/test_results b/pb_utils/test_results
deleted file mode 100644
index e69de29bb2d1..000000000000
-- 
GitLab