From 5d28268c8a30c061492a0c1878022f7f480d9a8d Mon Sep 17 00:00:00 2001
From: FKHals <5229803-FKHals@users.noreply.gitlab.com>
Date: Tue, 28 Feb 2023 13:58:32 +0100
Subject: [PATCH] Integrate LibPBM implementation

so that instead of just creating separate perf measurements for certain
processes the measurements get used to create a program behaviour model
(PBM) based on the monitored process behaviour.

The model only gets created for jobs started with mpirun from the bash
shell and it is only printed when mpirun exits.
---
 kernel/behave.c | 697 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/behave.h | 174 +++++++++++-
 kernel/exit.c   |  10 +-
 kernel/fork.c   |  30 ++-
 4 files changed, 813 insertions(+), 98 deletions(-)

diff --git a/kernel/behave.c b/kernel/behave.c
index c8fa9d32eb2d..f90b5a503ffb 100644
--- a/kernel/behave.c
+++ b/kernel/behave.c
@@ -8,153 +8,322 @@
 
 int is_initialized = 0;
 
-// =============================================================================
+/******************************************************************************
+* Based on "libpbm" (see header file for more info)
+*/
+
+#define TASK_BUF_SIZE 4096
 #define PROC_BUF_SIZE 512
 
-/*
- * PID -> perf_event* Hashmap
- * Only insertion is supported currently (since memory efficiency is not our
+/* -----------------------------------------------------------------------------
+ * PID -> PBM* Hashmap
+ * Only insertion is needed currently (since memory efficiency is not our
  * current concern)
  */
 typedef struct {
-    // primitive int -> pid_t and int -> perf_event* "hashmap" (not really) combination implemented
-    // using two arrays to store the pid and corresponding perf_event* at the same index
+    // primitive int -> int and int -> PBM* hashmap combination
+    // the two arrays store the pid and corresponding PBM* at the same index
     pid_t index_to_pid[PROC_BUF_SIZE];
-    struct perf_event* index_to_pevent[PROC_BUF_SIZE];
-    size_t last_proc_index; // index of currently last process in the arrays
-} task_pevent_map;
+    PBM* index_to_pbm[PROC_BUF_SIZE];
+    // index of currently last process in the arrays
+    size_t last_proc_index;
+    //pthread_mutex_t lock;
+} shared_pbm_int_map;
+
+shared_pbm_int_map _index_2_pbm;
+shared_pbm_int_map* index_2_pbm = &_index_2_pbm;
 
-task_pevent_map task_2_pevent;
 
-void init_task_pevent_map(void) {
-	int i;
+static void init_pbm_int_map(void) {
+    int i;
+    //index_2_pbm = init_shared_memory(index_2_pbm, sizeof(shared_pbm_int_map));
+
     for (i = 0; i < PROC_BUF_SIZE; ++i) {
-        task_2_pevent.index_to_pid[i] = 0;
-        task_2_pevent.index_to_pevent[i] = NULL;
+        index_2_pbm->index_to_pid[i] = 0;
+        index_2_pbm->index_to_pbm[i] = NULL;
     }
-    task_2_pevent.last_proc_index = 0;
-	printk(KERN_WARNING "Task map initialized by TASK: %u\n", current->pid);
+    index_2_pbm->last_proc_index = 0;
+    //init_shared_lock(&index_2_pbm->lock);
+}
+
+static void uninit_pbm_int_map(void) {
+    //munmap(index_2_pbm, sizeof(index_2_pbm));
 }
 
 /*
- * Returns the perf_event* of the corresponding given process by PID if it
+ * Returns the PBM-pointer of the corresponding given process by PID if it
  * exists otherwise it returns NULL (assumption: process with PID=0 is not
  * considered a normal process)
  *
  * This is a primitive "Hashmap"-retrieval implementation (O(n))
  */
-struct perf_event* get_pevent_by_pid(pid_t pid) {
-	size_t proc_index;
-	size_t i;
+PBM* get_pbm_by_pid(pid_t pid) {
+    size_t i;
     // to get the PBM by PID one needs to find the index that corresponds to
     // the given PID and use that to retrieve the PBM of the second array
     // directly by index
-    proc_index = 0;
-    for(i = 0; i <= task_2_pevent.last_proc_index; i++) {
-        if (task_2_pevent.index_to_pid[i] == pid) {
+    size_t proc_index = 0;
+    for(i = 0; i < PROC_BUF_SIZE; i++) {
+        if (index_2_pbm->index_to_pid[i] == pid) {
             proc_index = i;
             break;
         }
     }
-    return proc_index != 0 ? task_2_pevent.index_to_pevent[proc_index] : NULL;
+    return proc_index != 0 ? index_2_pbm->index_to_pbm[proc_index] : NULL;
 }
 
 /*
- * Adds a process (pid, perf_event) to the pid->perf_event hashmap (NON-idempotently!)
+ * Adds a process (pid, pbm) to the pid->pbm hashmap (NON-idempotently!)
  *
  * Returns if the process has been successfully inserted into the hashmap
  */
-int add_proc_to_map(pid_t pid, struct perf_event* pevent) {
-    if (PROC_BUF_SIZE <= task_2_pevent.last_proc_index) {
-        printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", task_2_pevent.last_proc_index);
+int add_proc_to_map(pid_t pid, PBM* pbm) {
+    //pthread_mutex_lock(&index_2_pbm->lock);
+    if (PROC_BUF_SIZE - 1 <= index_2_pbm->last_proc_index) { // both arrays hold PROC_BUF_SIZE slots and the index is pre-incremented below
+        printk(KERN_WARNING "PROC MAP ADD: last_proc_index too large: %lu\n", index_2_pbm->last_proc_index);
         return 0;
     }
-    printk(KERN_WARNING "i: %lu, pid: %u\n", task_2_pevent.last_proc_index, pid);
-    task_2_pevent.last_proc_index++;
-    task_2_pevent.index_to_pid[task_2_pevent.last_proc_index] = pid;
-    task_2_pevent.index_to_pevent[task_2_pevent.last_proc_index] = pevent;
+    printk(KERN_WARNING "i: %lu, pid: %u\n", index_2_pbm->last_proc_index, pid);
+    index_2_pbm->last_proc_index++;
+    index_2_pbm->index_to_pid[index_2_pbm->last_proc_index] = pid;
+    index_2_pbm->index_to_pbm[index_2_pbm->last_proc_index] = pbm;
+    //pthread_mutex_unlock(&index_2_pbm->lock);
     return 1;
 }
-// =============================================================================
 
-int start_counting(struct task_struct *p) {
+static void debug_print_map(void) {
+    size_t i;
+    // lock to make the map printing sequential without interleaving other
+    // outputs
+    //pthread_mutex_lock(&index_2_pbm->lock);
+    printk(KERN_WARNING "MAP\n-----\n");
+    for(i = 1; i < PROC_BUF_SIZE; i++) {
+        if (0 == index_2_pbm->index_to_pid[i])
+            break;
+        printk(KERN_WARNING "  %u\n", index_2_pbm->index_to_pid[i]);
+    }
+    printk(KERN_WARNING "-----\n");
+    //pthread_mutex_unlock(&index_2_pbm->lock);
+}
+
+/* -----------------------------------------------------------------------------
+ * Task buffer which holds the nodes of the task graph
+ */
+typedef struct {
+    // buffer that holds the nodes of the task graph
+    pbm_NODE task_buffer[TASK_BUF_SIZE];
+    // index of current task in task_buffer[]
+    uint32_t curr_task_index;
+    //pthread_mutex_t lock;
+} task_buf;
+
+task_buf _tasks;
+task_buf* tasks = &_tasks;
+
+void init_task_buf(void) {
+    //tasks = init_shared_memory(tasks, sizeof(task_buf));
+    memset(tasks, 0, sizeof(task_buf));
+    tasks->curr_task_index = 0;
+    //init_shared_lock(&tasks->lock);
+}
+
+void uninit_task_buf(void) {
+    //munmap(tasks, sizeof(tasks));
+}
+
+pbm_NODE* task_alloc(void) {
+    pbm_NODE* new_task_node;
+    //pthread_mutex_lock(&tasks->lock);
+    if (TASK_BUF_SIZE <= tasks->curr_task_index) {
+        printk(KERN_WARNING "ERROR: Tried to alloc more tasks than available!\n");
+        return NULL;
+    }
+    // get the memory address of the next free task node space
+    new_task_node = &(tasks->task_buffer[tasks->curr_task_index]);
+    tasks->curr_task_index++;
+    //pthread_mutex_unlock(&tasks->lock);
+    return new_task_node;
+}
+
+/* Debug aid: log every task node handed out by task_alloc() so far. */
+void debug_print_tasks(void) {
+    size_t i;
+    printk(KERN_WARNING "-----\nTASKS:\n");
+    for(i = 0; i < tasks->curr_task_index; i++) { // curr_task_index is the next FREE slot, so stop before it
+        const pbm_NODE* t = &tasks->task_buffer[i];
+        printk(KERN_WARNING "type: %u, thread_id: %u, count: %llu,  children: %llx, next_sib: %llx\n", t->type, t->thread_id, t->count, (uint64_t)t->children, (uint64_t)t->next_sib);
+    }
+    printk(KERN_WARNING "^^^^^\n");
+}
+
+ /* -----------------------------------------------------------------------------
+ * PBM buffer which holds the nodes of the process graph
+ */
+typedef struct {
+    // buffer that holds the nodes of the process graph
+    PBM process_buffer[PROC_BUF_SIZE];
+    // index of current process in proc_buffer[]
+    uint32_t curr_proc_index;
+    //pthread_mutex_t lock;
+} proc_buf;
+
+proc_buf _procs;
+proc_buf* procs = &_procs;
+
+static void init_proc_buf(void) {
+    //procs = init_shared_memory(procs, sizeof(proc_buf));
+    memset(procs, 0, sizeof(proc_buf));
+    procs->curr_proc_index = 0;
+    //init_shared_lock(&procs->lock);
+}
+
+static void uninit_proc_buf(void) {
+    //munmap(procs, sizeof(procs));
+}
+
+PBM* proc_alloc(void) {
+    PBM* new_pbm;
+    //pthread_mutex_lock(&procs->lock);
+    if (PROC_BUF_SIZE <= procs->curr_proc_index) {
+        printk(KERN_WARNING "ERROR: Tried to alloc more processes than available!\n");
+        return NULL;
+    }
+    printk(KERN_WARNING "alloc proc index: %u\n", procs->curr_proc_index);
+    // get the memory address of the next free process node space
+    new_pbm = &(procs->process_buffer[procs->curr_proc_index]);
+    procs->curr_proc_index++;
+    //pthread_mutex_unlock(&procs->lock);
+    return new_pbm;
+}
+
+/* Debug aid: log every PBM handed out by proc_alloc() so far. */
+void debug_print_procs(void) {
+    size_t i;
+    printk(KERN_WARNING "-----\nPROCS:\n");
+    for(i = 0; i < procs->curr_proc_index; i++) { // curr_proc_index is the next FREE slot, so stop before it
+        const PBM* p = &procs->process_buffer[i]; // pointer instead of a by-value copy of the whole PBM
+        printk(KERN_WARNING "root: %llx, last: %llx, children: %llx, next_sib: %llx\n", (uint64_t)p->root, (uint64_t)p->last, (uint64_t)p->children, (uint64_t)p->next_sib);
+    }
+    printk(KERN_WARNING "^^^^^\n");
+}
+
+/* -----------------------------------------------------------------------------
+* General function for management and creation of program behavior models (PBMs)
+*/
+
+void pbm_init(void) {
+    init_pbm_int_map();
+    init_task_buf();
+    init_proc_buf();
+}
+
+void pbm_uninit(void) {
+    uninit_pbm_int_map();
+    uninit_task_buf();
+    uninit_proc_buf();
+}
+
+/* Insert a task node into the PBM of given type (only COMP for now) */
+int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc) {
+    pbm_NODE* node;
+
 	unsigned long irq_flags;
-    struct perf_event_attr pe;
 	struct perf_event *pevent;
-	u64 cpu;
 
-	if (!is_initialized) {
-		init_task_pevent_map();
-		is_initialized = 1;
+	// only continue for COMP-nodes since other types are not implemented yet
+    if(!(pbm && type == COMP)) {
+        return 0;
 	}
 
-	memset(&pe, 0, sizeof(struct perf_event_attr));
-	pe.type = PERF_TYPE_HARDWARE;
-	pe.size = sizeof(struct perf_event_attr);
-	pe.config = PERF_COUNT_HW_INSTRUCTIONS;
-	pe.sample_period = 400800;
-	pe.disabled = 0;		// start the counter as soon as we're in userland
-	pe.pinned = 1;			// ?
-	pe.exclude_kernel = 1;
-	pe.exclude_hv = 1;
+	/*
+    * Append a new node to the task graph
+    */
+    node = _pbm_create_node(type, proc->pid);
+    if(!node) {
+        printk(KERN_WARNING "ERROR: Could not create node!\n");
+        return 0;
+    }
+    if(!pbm->root) {
+        pbm->root = node;
+	} else {
+        pbm->last->children = node; // append the new node
+	}
+	// finally the new node becomes the last inserted one
+    pbm->last = node;
 
-	// Not needed on 3.2?
-	pe.wakeup_events = 1;
+    /*
+    * Configure the performance counter
+    */
+    memset(&(pbm->pea), 0, sizeof(struct perf_event_attr));
+	pbm->pea.type = PERF_TYPE_HARDWARE;
+	pbm->pea.size = sizeof(struct perf_event_attr);
+	pbm->pea.config = PERF_COUNT_HW_INSTRUCTIONS;
+	pbm->pea.sample_period = 400800;
+	pbm->pea.disabled = 0; // start the counter as soon as we're in userland
+	pbm->pea.pinned = 1;
+	pbm->pea.exclude_kernel = 1;
+	pbm->pea.exclude_hv = 1;
+	pbm->pea.wakeup_events = 1; // Not needed on 3.2?
 
-	cpu = smp_processor_id();
-	printk(KERN_WARNING "TASK: %u, CPU: %llu, PTR: %llu\n", p->pid, cpu, (u64)p);
+	printk(KERN_WARNING "TASK: %u, CPU: %i, PTR: %llu\n", proc->pid, smp_processor_id(), (u64)proc);
 
+    /*
+    * Try to enable the performance counter
+    */
 	// disable irqs to make 'perf_event_ctx_activate' in 'kernel/events/core.c' happy
 	local_irq_save(irq_flags);
 	pevent = perf_event_create_kernel_counter(
-        &pe,
+        &(pbm->pea),
         -1,   // measure on all cores (in case the process runs on different ones)
-        p,    // exclusively measure the forked process (BEWARE: a process can only measure itself!)
+        proc, // exclusively measure the forked process (BEWARE: a process can only measure itself!)
         NULL, //&overflow_handler,
         NULL
     );
 	local_irq_restore(irq_flags);
-
 	if (IS_ERR(pevent)) {
-			printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", p->pid, PTR_ERR(pevent));
+			printk(KERN_WARNING "TASK: %u | PB ERROR INITIALISING PERF EVENT: %li\n", proc->pid, PTR_ERR(pevent));
+            // cast to prevent compiler warnings
+            if (-EOPNOTSUPP == (int64_t)pevent) {
+                printk(KERN_WARNING
+                    "TASK: %u | EOPNOTSUPP (-95): The hardware does not support certain attributes! "
+                    "E.g. perf_event_attr.precise_ip > 0 may not be supported.\n", proc->pid);
+            }
+            if (-EINVAL == (int64_t)pevent) {
+                printk(KERN_WARNING
+                    "TASK: %u | EINVAL (-22): Invalid argument!"
+                    "E.g. CPU with given index does not exist.\n", proc->pid);
+            }
             return 0;
 	}
     if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
-		printk(KERN_WARNING "TASK: %u | Event is inactive", p->pid);
+		printk(KERN_WARNING "TASK: %u | Event is inactive", proc->pid);
 	}
+	pbm->pevent = pevent;
 
-	add_proc_to_map(p->pid, pevent);
+    printk(KERN_WARNING "TASK: %u | Counting started...\n", proc->pid);
 
-    printk(KERN_WARNING "TASK: %u | Counting started...\n", p->pid);
-	return 1;
+    return 2;
 }
 
-/*
- *	handle the perf overflow event -> task needed more instructions than planed
- */
-void overflow_handler(
-		struct perf_event *event,
-		struct perf_sample_data *data,
-		struct pt_regs *regs)
-{
-	printk(KERN_WARNING "PB TASK RAN TOO LONG\n");
-	//perf_event_get
-	//->unlocked_ioctl(event, reset);
-}
-
-int stop_counting(void) {
+/* Conclude the last task of the given PBM */
+int pbm_task_end(PBM* pbm) {
 	unsigned long irq_flags;
     int read_error;
 	struct perf_event *pevent;
 	u64 perf_counter;
 
+    if (!pbm) {
+        printk(KERN_WARNING "Error: Could not end given task due to invalid PBM!\n");
+        return 0;
+    }
+
+    // record performance results
 	if (!is_initialized) {
 		printk(KERN_WARNING "TASK: %u | Pevent map not initialized!\n", current->pid);
 		return 0;
 	}
-
 	printk(KERN_WARNING "TASK: %u | Stopping counting...\n", current->pid);
-	pevent = get_pevent_by_pid(current->pid);
+	pevent = pbm->pevent;
 	if (!pevent) {
 		printk(KERN_WARNING "TASK: %u | ERROR: Could not find perf_event!\n", current->pid);
 		return 0;
@@ -163,12 +332,21 @@ int stop_counting(void) {
 		printk(KERN_WARNING "TASK: %u | PEVENT INVALID\n", current->pid);
 		return 0;
 	}
-
     read_error = perf_event_read_local(pevent, &perf_counter);
 	if (read_error) {
 		printk(KERN_WARNING "TASK: %u | FETCHING PERFORMANCE COUNTER IN stop_counting FAILED WITH %i\n", current->pid, read_error);
+            if (-EINVAL == (int64_t)read_error) {
+                // If this is a per-task event, it must be for current.
+                // If this is a per-CPU event, it must be for this CPU.
+                printk(KERN_WARNING
+                    "TASK: %u | EINVAL (-22): Invalid argument! "
+                    "E.g. trying to measure a different task than itself.\n", current->pid);
+            }
+	} else {
+		pbm->last->count = perf_counter;
 	}
 
+	// disable performance counter while preventing context switching
 	local_irq_save(irq_flags);
 	perf_event_disable(pevent);
 	perf_event_release_kernel(pevent);
@@ -176,5 +354,368 @@ int stop_counting(void) {
     pevent = NULL;
 
     printk(KERN_WARNING "TASK: %u | ...Counting stopped: %llu instr.\n", current->pid, perf_counter);
-	return 0;
+
+    return 0;
 }
+
+/**
+ * Why is the code concerning the forking separated into the two functions
+ * pbm_fork_parent_new_task() and pbm_fork() instead of simply putting it at the end of _do_fork?
+ *
+ * The separation is necessary since in the _do_fork a context switch from the parent to the
+ * child process takes place which is problematic since we want to end (and restart) perf-measuring
+ * the parent as well as the child process and the measurements (in pbm_task_end()) can only happen
+ * from the process itself. But in the beginning of _do_fork the child process does not exist yet.
+ * Therefore we have to split the code into the two functions to be able to measure the parent
+ * before the context switch as well as initialize the child-measuring after switching to the child.
+ */
+
+/* Stop previous task and start new task for the parent process and also reset the perf counter
+ * Returns a pointer to the fork-task-node which the forked process can use as a time information.
+ *
+ * BEWARE:
+ * Inside the _do_fork routine the context gets switched from the parent to the child process.
+ * This function must get called in the _do_fork() routine BEFORE (!) the child process starts to
+ * run (current == parent) otherwise the perf counting will fail!
+ */
+ pbm_NODE* pbm_fork_parent_new_task(struct task_struct* parent_proc) {
+    PBM* parent_pbm;
+    pbm_NODE* fork_node;
+
+    // end task of parent process
+	parent_pbm = get_pbm_by_pid(parent_proc->pid);
+	if(!parent_pbm) {
+		printk(KERN_WARNING "COULD NOT FIND PARENT-PBM!\n");
+		//TODO Since this will happen right at the first call to pbm_fork()
+		//TODO since no parent process has been initialized (since this is
+		//TODO the first relevant parent process)
+        return NULL;
+	}
+	pbm_task_end(parent_pbm);
+
+	/*
+    * Before starting the new task, append the fork-node to the task graph to maintain the correct order
+    */ 
+    fork_node = _pbm_create_node(FORK, parent_proc->pid);
+    if(!fork_node) {
+        printk(KERN_WARNING "COULD NOT CREATE NEW FORK NODE!\n");
+        return NULL;
+    }
+    if(!parent_pbm->root) {
+        parent_pbm->root = fork_node;
+    } else {
+        parent_pbm->last->children = fork_node; // append the new node
+    }
+    parent_pbm->last = fork_node; // the new node becomes the last inserted one
+
+    // start the new task
+	pbm_task_start(parent_pbm, COMP, parent_proc);
+
+    return fork_node;
+ }
+
+//TODO Consider the difference between calling it from the root-process (that has no registered parent process) and from "normal" child processes (that are registered in the maps)
+/* Insert a FORK node into the given PBM for up to 'num_thr' child threads
+ *
+ * BEWARE:
+ * Inside fork.c:_do_fork() the context gets switched from the parent to the child process.
+ * This function must get called in the _do_fork() routine AFTER (!) the child process starts to
+ * run (current == child) otherwise the perf counting will fail!
+ */
+int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_date) {
+	unsigned long irq_flags;
+
+	PBM* parent_pbm;
+    PBM* child_pbm;
+
+    /* NOTE:
+     * since the first time that _do_fork() is called the parent is "bash" which causes the
+     * pbm_fork_parent_new_task() to not be called but pbm_fork() since the child is "mpirun" so we
+     * put the initialization in here instead of into pbm_fork_parent_new_task().
+     */
+	// avoid context switching during initialization by disabling interrupts
+	local_irq_save(irq_flags);
+	if (!is_initialized) {
+		pbm_init();
+		is_initialized = 1;
+	}
+	local_irq_restore(irq_flags);
+
+	child_pbm = get_pbm_by_pid(proc->pid);
+	parent_pbm = get_pbm_by_pid(parent_pid);
+
+    printk(KERN_WARNING "FORK: %u from parent %u\n", proc->pid, parent_pid);
+
+    // check if the child already exists (if and only if the "forked" process
+    // is the process itself which happens in this case because OpenMP also
+    // uses the parent process for parallel calculations)
+    if (!child_pbm) {
+        // Create and initialize a new PBM for the child
+        {
+            child_pbm = proc_alloc();
+            if(!child_pbm) {
+                printk(KERN_WARNING "ERROR: Could not alloc child-PBM! %llx\n", (uint64_t)&child_pbm);
+                return 0;
+            }
+
+            // general configurations for perf_event interface
+            child_pbm->pea.size = sizeof(struct perf_event_attr);
+
+            child_pbm->root = NULL;
+            child_pbm->last = NULL;
+            child_pbm->children = NULL;
+            child_pbm->next_sib = NULL;
+            child_pbm->fork_date = NULL; // this gets updated later in the function // TODO Remove initialization here since it is done later?
+            child_pbm->exit_date = NULL;
+        }
+        if (!add_proc_to_map(proc->pid, child_pbm)) {
+            printk(KERN_WARNING "FORK ERROR: Could not add process to map: %u\n", proc->pid);
+            return 0;
+            // TODO Reverse previous allocation of child pbm?
+        } else {
+            printk(KERN_WARNING "Added process to map: %u\n", proc->pid);
+            debug_print_map(); //FIXME
+        }
+    } else {
+        printk(KERN_WARNING "Process already exists: %u\n", proc->pid);
+    }
+
+    // add child pbm to parents children
+	if(parent_pbm) { // checking this is only important in case of the root task which has no (recorded) parent
+		if(parent_pbm->children) {
+			// prepend the child to the list of children so we don't have to
+			// modify the sibling (which works since we use a singly-linked
+			// list)
+			child_pbm->next_sib = parent_pbm->children;
+		}
+		parent_pbm->children = child_pbm;
+	}
+
+    // We have to know WHEN the exit happens relative to the parent. So every child remembers the
+    // current fork-task-node of the parent on exit (so that the join can happen at the correct
+    // position (more or less, may be imperfect due to parallelism))
+    child_pbm->fork_date = fork_date;
+
+    // start performance counting for the child (counting for the parent has already been restarted in pbm_fork_parent_new_task())
+    pbm_task_start(child_pbm, COMP, proc);
+
+    return 1;
+}
+
+// This should get called by the child at sysexit()
+int pbm_exit(pid_t pid, pid_t parent_pid) {
+    PBM* pbm;
+    PBM* parent_pbm;
+
+    printk(KERN_WARNING "EXIT: %u\n", pid);
+    pbm = get_pbm_by_pid(pid);
+    if(!pbm) {
+        printk(KERN_WARNING "COULD NOT FIND PBM!\n");
+        debug_print_map();
+        return 0;
+    }
+    pbm_task_end(pbm);
+
+    parent_pbm = get_pbm_by_pid(parent_pid);
+    // set current parent task as the exit task of this child where the join
+    // gets inserted
+    if(parent_pbm) {
+        pbm->exit_date = parent_pbm->last;
+    }
+
+    return 1;
+}
+
+/* -----------------------------------------------------------------------------
+* PBM graph post-processing functions
+*/
+
+/* Insert a JOIN node into the given PBM and merge the forked child PBMs into this PBM */
+int pbm_join(PBM* child_pbm) {
+    pbm_NODE* fork_node;
+    pbm_NODE* join_node;
+    pid_t join_label;
+    // refuse incomplete PBMs (no recorded task, or unknown fork/exit date) instead of dereferencing NULL
+    if(!child_pbm || !child_pbm->root || !child_pbm->last || !child_pbm->fork_date || !child_pbm->exit_date)
+        return 0;
+
+    fork_node = child_pbm->fork_date;
+
+    // the child process is used to label the join operation to know which process the join belongs
+    // to since using the parent as the label would be ambiguous since more than one child could
+    // have been spawned by the same parent
+    join_label = child_pbm->last->thread_id;
+    join_node = _pbm_create_node(JOIN, join_label);
+    if(!join_node) {
+        printk(KERN_WARNING "ERROR: Could not create node!\n");
+        return 0;
+    }
+
+    // any fork-node has exactly two children because a fork creates only one
+    // copy of an existing process (1 (copy) + 1 (existing) = 2)
+    fork_node->count = 2;
+    join_node->count = 2;
+
+    // insert front of child task graph into parent task graph:
+    // prepend child-task-tree to list of child-nodes in the fork-node
+    {
+        // assumption: child_pbm->root->next_sib == NULL (since any fork-node
+        // has at most two children and only one before the join-operation)
+        child_pbm->root->next_sib = fork_node->children;
+        fork_node->children = child_pbm->root;
+    }
+
+    // insert back of child task graph with appended join-node into parent task
+    // graph
+    {
+        // assumption: child_pbm->last->children == NULL (since it should be
+        // the last task the child did before exit)
+        // append join node to child task-graph
+        child_pbm->last->children = join_node;
+        // insert join node directly after the exit-date-node in the parent pbm
+        join_node->children = child_pbm->exit_date->children;
+        child_pbm->exit_date->children = join_node;
+    }
+
+    return 1;
+}
+
+// recursively traverse all PBMs and insert the child task-graphs
+void pbm_post_processing(PBM* pbm) {
+    PBM* child_pbm;
+
+    if(!pbm)
+        return;
+
+    // Walk the whole sibling list of children: each child first gets its own
+    // descendants merged (depth-first) and is then joined into the parent's
+    // task graph. Joining only pbm->children would leave later siblings unmerged.
+    for(child_pbm = pbm->children; child_pbm; child_pbm = child_pbm->next_sib) {
+        pbm_post_processing(child_pbm);
+        pbm_join(child_pbm);
+        // TODO Remove from list of childs or just mark as visited?
+    }
+}
+
+/* -----------------------------------------------------------------------------
+* PBM graph output functions
+*/
+
+void pbm_join_and_print_graph_self(pid_t pid) {
+    PBM* pbm;
+
+    debug_print_map();
+    debug_print_procs();
+    debug_print_tasks();
+    printk(KERN_WARNING "indices: %lu, %u, %u\n", index_2_pbm->last_proc_index, procs->curr_proc_index, tasks->curr_task_index);
+    pbm = get_pbm_by_pid(pid);
+    if (pbm) {
+        pbm_post_processing(pbm);
+        pbm_print_graph(pbm, pbm->root);
+    } else {
+        printk(KERN_WARNING "JOIN: PBM not found for: %u\n", pid);
+    }
+}
+
+/* Crude recursive ADG printer, starts with given node */
+void pbm_print_graph(PBM* pbm, pbm_NODE* node) {
+    pbm_NODE* root;
+    char types[5][5] = {"", "FORK", "JOIN", "COMP", "COMM"};
+
+    if(!node)
+        return;
+
+    if(node->visited)
+        return;
+
+    root = node;
+
+    printk(KERN_WARNING "Node %p: (%s, count = %llu), children:\n", node, types[node->type], node->count);
+
+    if(node->children)
+    {
+        node = node->children;
+        while(node)
+        {
+            printk(KERN_WARNING "  -- Node %p: (%s, count = %llu), next sibling: %p\n", node, types[node->type], node->count, node->next_sib);
+            node = node->next_sib;
+        }
+    }
+
+    if(root->children)
+        pbm_print_graph(pbm, root->children);
+
+    if(root->next_sib)
+        pbm_print_graph(pbm, root->next_sib);
+
+    root->visited = 1;
+    if(root == pbm->root)
+        _pbm_unvisit_node(pbm->root);
+}
+
+/* -----------------------------------------------------------------------------
+* Auxiliary functions, not for public use.
+*/
+
+pbm_NODE* _pbm_create_node(uint8_t type, pid_t pid) {
+    pbm_NODE* node = task_alloc();
+    if(!node)
+        return NULL;
+
+    node->thread_id = pid;
+    node->type = type;
+    node->count = 0;
+    node->children = NULL;
+    node->next_sib = NULL;
+    node->visited = 0;
+    return node;
+}
+
+// recursive
+void _pbm_unvisit_node(pbm_NODE* node) {
+    if(!node)
+       return;
+
+    if(!node->visited)
+        return;
+
+    if(node->children)
+        _pbm_unvisit_node(node->children);
+
+    if(node->next_sib)
+        _pbm_unvisit_node(node->next_sib);
+
+    node->visited = 0;
+}
+
+/******************************************************************************/
+
+/*
+ *	handle the perf overflow event -> task needed more instructions than planed
+ */
+/*
+static void overflow_handler(
+		struct perf_event *event,
+		struct perf_sample_data *data,
+		struct pt_regs *regs)
+{
+	printk(KERN_WARNING "PB TASK RAN TOO LONG\n");
+}
+*/
+
+int is_root_process(struct task_struct* p) {
+    return strcmp(p->comm, "mpirun") == 0;
+}
+
+int is_relevant_process(struct task_struct* p) {
+    struct task_struct* proc = p;
+    // walk up the ancestry looking for mpirun; stop at the idle task (PID 0 / "swapper") - NOTE(review): on SMP the name may be "swapper/<cpu>", hence the PID check
+    while (proc && proc->pid != 0 && strcmp(proc->comm, "swapper") != 0) {
+        if (is_root_process(proc)) {
+            return 1;
+        }
+        //printk(KERN_WARNING "Searching relevant process: %s\n", proc->comm);
+        proc = proc->real_parent;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/kernel/behave.h b/kernel/behave.h
index 4e83c96cfcce..881970f662de 100644
--- a/kernel/behave.h
+++ b/kernel/behave.h
@@ -3,12 +3,176 @@
 
 #include <linux/perf_event.h>
 
+/******************************************************************************
+* Based on "libpbm":
+*  Program Behaviour Model (PBM) as a Task Precedence Graph (TPG),
+*  implemented as an Acyclic Directed Graph (ADG) structure. Using
+*  Linux' perf_event interface for task performance measurement.
+*  Author: Michael Zent
+*  Context: Softwareproject 'Cluster Management', Lecturer: Barry Linnert, SS 2022 FU Berlin
+*/
+
+// node types
+#define FORK 1
+#define JOIN 2
+#define COMP 3 // computation
+#define COMM 4 // communication (not supported yet)
+
+/*
+* PBM node, describing a program task
+*/
+typedef struct _pbm_NODE
+{
+    uint8_t type;      // FORK, JOIN, or COMP (COMM not supported yet)
+    int32_t thread_id; // ID of the current thread within its thread group
+
+    /*
+    * Performance count value, interpretation depends on type
+    * FORK - Number of forked threads
+    * JOIN - Number of joined threads
+    * COMP - Number of instructions needed to complete the task
+    * COMM - Total length of all messages sent, in byte (not supported yet)
+    */
+    uint64_t count;
+
+    /*
+    * Inter-node connectors
+    * children - First child in a list of children. There should be >= 1 children
+    *            only if type == FORK, else a node has just one child.
+    * next_sib - Next sibling of a node. Should be != NULL only if the node is a
+    *            child of a FORK node, with exception of the last child.
+    */
+    struct _pbm_NODE* children; // first child (in a list of children)
+    struct _pbm_NODE* next_sib; // next sibling
+
+    // marker for graph traversal
+    uint8_t visited;
+} pbm_NODE;
+
+/*
+* Program Behavior Model (PBM)
+*/
+typedef struct _PBM
+{
+    pbm_NODE* root; // first task of a thread // TODO Rename to first or first_task?
+    pbm_NODE* last; // current last task
+
+    // the task nodes of the parent which are used as time markers/dates to
+    // know where the task graph must be inserted in the parent task-graph in
+    // the post-processing stage
+    pbm_NODE* fork_date; // fork task of the parent process
+    pbm_NODE* exit_date; // current/last task of the parent while this child exited
+
+    /*
+    * The Fork Buffer contains pointers to PBMs which describe the forked
+    * child threads. Should be != NULL with size > 0 only after forking.
+    */
+    struct _PBM* children; // first child (in a list of forked children)
+    struct _PBM* next_sib; // next sibling
+
+    /*
+    * Performance measurement and recording
+    */
+    struct perf_event_attr pea; // config info for the perf_event interface
+    struct perf_event* pevent;
+} PBM;
+
+/*
+* Creates a new PBM and initializes it.
+* Should be the first PBM-function called.
+*
+* Returns a pointer to that PBM, or NULL on error.
+*/
+PBM* pbm_create(void);
+
+/*
+* Deletes the given PBM and frees the associated resources.
+* Should only be called if pbm_create() was called prior, and
+* if no pbm_task_start() or pbm_fork() remained unclosed.
+*/
+void pbm_destroy(PBM* pbm);
+
+/*
+* Inserts into the given PBM a new node of given type (for now only
+* COMP, as COMM is not supported yet) and starts performance counting.
+* Should be called immediately before the task starts, and be closed
+* by pbm_task_end().
+*
+* Returns 0 on error (including when performance counting could not be
+* started), or 2 once the node has been inserted and the perf counter created.
+*/
+int pbm_task_start(PBM* pbm, uint8_t type, struct task_struct* proc);
+
+/*
+* Ends performance counting for the last node of the given PBM and
+* records the results.
+* Should be called immediately after the task ends, and as the next
+* PBM-method after pbm_task_start().
+*
+* Returns 0 on failure, i.e. the performance counts could not be
+* recorded, otherwise returns 1.
+*/
+int pbm_task_end(PBM* pbm);
+
+ pbm_NODE* pbm_fork_parent_new_task(struct task_struct* parent_proc);
+
+/*
+* Sets up behaviour tracking for the freshly forked process 'proc'.
+* Runs pbm_init() on the very first call, allocates a PBM for 'proc' and maps
+* it to its PID (unless one is already mapped), prepends that PBM to the
+* child list of the parent's PBM (looked up via 'parent_pid', if one exists),
+* records 'fork_date' (the parent's FORK node, may be NULL) in the child PBM
+* and starts a COMP task, i.e. performance counting, for 'proc'.
+*
+* Returns 0 on error, or 1 on success.
+*
+* The process as an explicit argument is needed since the fork is called by the parent.
+*/
+int pbm_fork(struct task_struct* proc, pid_t parent_pid, pbm_NODE* fork_date);
+
+int pbm_exit(pid_t pid, pid_t parent_pid);
+
+/*
+* Inserts into the given PBM a JOIN node. Records the actual number of
+* forked threads. Merges the sub-PBMs, describing the child threads, from
+* the fork buffer into the parent PBM and releases the fork-buffer.
+* Should be called immediately after the child threads are joined, and as
+* the next PBM-method after pbm_fork().
+*
+* Returns 0 on error, or 1 on success.
+*/
+int pbm_join(PBM* child_pbm);
+
+/*
+* Crude recursive ADG printer. Starts with the given node, which
+* should be the root node of the given PBM.
+*/
+void pbm_print_graph(PBM* pbm, pbm_NODE* node);
+
+void pbm_join_and_print_graph_self(pid_t pid);
+
+/*
+* Auxiliary methods, not for public usage. Nomen est omen.
+*/
+pbm_NODE* _pbm_create_node(uint8_t type, pid_t pid);
+void _pbm_destroy_node(pbm_NODE* node);
+void _pbm_unvisit_node(pbm_NODE* node);
+
+/*
+* Initialize the map necessary to locate the PBM by given PID
+* 
+* BEWARE: Forgetting to call this before calling fork() will trigger
+* Segmentation faults!
+*/
+void pbm_init(void);
+void pbm_uninit(void);
+
+/******************************************************************************/
+
+int is_root_process(struct task_struct* p);
+int is_relevant_process(struct task_struct *p);
+
 int start_counting(struct task_struct *p);
 int stop_counting(void);
 
-void overflow_handler(
-		struct perf_event *,
-		struct perf_sample_data *,
-		struct pt_regs *regs);
-
 #endif //PLAN_BASED_LINUX_SCHEDULER_BEHAVE_H
diff --git a/kernel/exit.c b/kernel/exit.c
index 97470434dad1..1d7f88f7aec2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -768,11 +768,11 @@ void __noreturn do_exit(long code)
 	int group_dead;
 
 	// call the readout before the process is terminated
-	if (strcmp(tsk->real_parent->comm, "bash") == 0
-			|| strcmp(tsk->real_parent->real_parent->comm, "bash") == 0
-			|| strcmp(tsk->real_parent->real_parent->real_parent->comm, "bash") == 0) {
-		stop_counting();
-		printk(KERN_EMERG "EXIT: %u, CMD: '%s', PARENT-CMD: '%s', PTR: %llu\n", tsk->pid, tsk->comm, tsk->real_parent->comm, (u64)tsk);
+	if (is_relevant_process(tsk)) {
+		pbm_exit(tsk->pid, tsk->real_parent->pid);
+	}
+	if (is_root_process(tsk)) {
+		pbm_join_and_print_graph_self(tsk->pid);
 	}
 
 	TASKS_RCU(int tasks_rcu_i);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c2aab1bbb6a..b3ec7595bd1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2016,7 +2016,16 @@ long _do_fork(unsigned long clone_flags,
 	int trace = 0;
 	long nr;
 
-    printk(KERN_EMERG "DO FORK CALLED by: %u\n", current->pid);
+	pbm_NODE* fork_date;
+	pid_t parent_pid = current->pid;
+	fork_date = NULL;
+
+	// FIXME: This is not reached for mpirun itself, because bash is the parent (current) here
+	if (is_relevant_process(current)) {
+		printk(KERN_EMERG "DO FORK CALLED by: '%s' %u\n", current->comm ,parent_pid);
+		fork_date = pbm_fork_parent_new_task(current);
+	}
+
 	/*
 	 * Determine whether and which event to report to ptracer.  When
 	 * called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2060,6 +2069,16 @@ long _do_fork(unsigned long clone_flags,
 			get_task_struct(p);
 		}
 
+		// this must be called before the new task wakes up to make sure that
+		// initialization of the perf event is done at that point
+		if (is_relevant_process(p)) {
+			// BEWARE: At this point p->comm is not yet up to date but shows the
+			// command of the parent!
+			printk(KERN_EMERG "FORKED!!!!: %u, Parent: %s, Super-Parent:%s\n",
+					p->pid, p->comm, p->real_parent->real_parent->comm);
+			pbm_fork(p, parent_pid, fork_date);
+		}
+
 		wake_up_new_task(p);
 
 		/* forking complete and child started to run, tell ptracer */
@@ -2072,15 +2091,6 @@ long _do_fork(unsigned long clone_flags,
 		}
 
 		put_pid(pid);
-        printk(KERN_EMERG "FORKED!!!!: %u, Parent: %s, Super-Parent:%s\n",
-				p->pid, p->comm, p->real_parent->real_parent->comm);
-		//FIXME At this point p->comm is not up to date but shows the command of the parent!
-		//      (This may not be a problem since the name of the forked processes are not needed?)
-		if (strcmp(p->comm, "bash") == 0
-				|| strcmp(p->real_parent->comm, "bash") == 0
-				|| strcmp(p->real_parent->real_parent->comm, "bash") == 0) {
-			start_counting(p);
-		}
 	} else {
 		nr = PTR_ERR(p);
 	}
-- 
GitLab