#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "perf_error_detection.h"
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/perf_event.h>
#include <linux/kthread.h>
#include <linux/slab.h>
typedef struct pb_plan pb_plan_t;
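/*
* pb_set_plan: userspace entry point for installing an instruction-count plan
* on the pb scheduler. It copies the plan descriptor and the per-entry
* instruction counts from userspace, moves the target task to the pb
* scheduling class and submits the plan on the current runqueue.
* (struct pb_plan is defined elsewhere; it is assumed here to expose at least
* .pid, .num_tasks and .inst_cnt, which are the fields used below.)
*/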
SYSCALL_DEFINE1(pb_set_plan, pb_plan_t __user*, plan) {
pb_plan_t _plan;
struct task_struct* task;
struct rq* rq;
struct pb_rq* pb_rq;
size_t expected;
uint64_t* inst_cnt;
unsigned long copied;
unsigned int i;
int res;
copied = copy_from_user(&_plan, plan, sizeof(pb_plan_t));
if (copied != 0) {
return -EFAULT;
}
expected = _plan.num_tasks * sizeof(*_plan.inst_cnt);
// kcalloc checks num_tasks * sizeof(...) for overflow and fails instead of under-allocating
inst_cnt = kcalloc(_plan.num_tasks, sizeof(*_plan.inst_cnt), GFP_KERNEL);
if (inst_cnt == NULL) {
return -ENOMEM;
}
copied = copy_from_user(inst_cnt, _plan.inst_cnt, expected);
if (copied != 0) {
printk(KERN_WARNING "pb_set_plan: copy_from_user for inst_cnt failed\n");
kfree(inst_cnt); // don't leak the plan buffer on the error path
return -EFAULT;
}
task = find_task_by_vpid(_plan.pid);
if (!task) {
kfree(inst_cnt); // don't leak the plan buffer on the error path
return -ESRCH;
}
rq = this_rq();
task->sched_class = &pb_sched_class;
pb_rq = &rq->pb;
set_pb_plan_size(pb_rq, _plan.num_tasks);
for (i = 0; i < _plan.num_tasks; i++ ) {
set_pb_plan_entry(
pb_rq,
i,
inst_cnt[i],
i,
task
);
}
kfree(inst_cnt);
res = pb_submit_plan(rq);
if (res == -1) {
printk("pb_submit_plan == -1\n");
return res;
}
return 0;
}
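/*
* Minimal userspace sketch of how this syscall might be invoked. This is an
* illustrative assumption, not part of the kernel build: the syscall number
* __NR_pb_set_plan and the exact field types of struct pb_plan depend on how
* the syscall is wired up and where the struct is actually defined.
*
*   #include <stdint.h>
*   #include <unistd.h>
*   #include <sys/syscall.h>
*
*   struct pb_plan { pid_t pid; uint64_t num_tasks; uint64_t *inst_cnt; };
*
*   uint64_t counts[2] = { 1000000, 2000000 };
*   struct pb_plan plan = { .pid = getpid(), .num_tasks = 2, .inst_cnt = counts };
*   long ret = syscall(__NR_pb_set_plan, &plan);  // 0 on success, negative errno on failure
*/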
/*
* It is possible to submit a plan only if no plan is currently being executed
*/
int pb_submit_plan(struct rq *rq)
{
/*
* Must be volatile to ensure correct initialization order
*/
volatile struct pb_rq * pb = (volatile struct pb_rq*)(&(rq->pb));
int perf_init_res;
int i = 0;
if (pb->mode != PB_DISABLED_MODE) {
return -1;
}
perf_init_res = init_perf_event(&pb->plan[i], &pb->pevent);
if(perf_init_res < 0) {
//initialization error detection/handling could happen here
printk(KERN_WARNING "PB INIT,%u: FAILED OPEN PERF EVENT\n", i);
} else {
printk(KERN_DEBUG "PB INIT,%u\n", i);
}
pb->c_entry = 0;
pb->count_pb_cycles = 0;
pb->count_admin_cycles = 0;
pb->total_instr = 0;
pb->is_initialized = 1; // must be initialized last
resched_curr(rq); // reschedule ASAP
return 0;
}
EXPORT_SYMBOL(pb_submit_plan);
/*
* Kelvin's Testcodes
*/
void set_pb_plan_size(struct pb_rq *pb_rq, unsigned int size)
{
pb_rq->plan = kcalloc(size, sizeof(struct plan_entry), GFP_KERNEL);
pb_rq->size = pb_rq->plan ? size : 0; // don't advertise entries we failed to allocate
}
EXPORT_SYMBOL(set_pb_plan_size);
/*
* Kelvin's Testcode
*/
// insert an entry into the pb plan (analogous to enqueue)
void set_pb_plan_entry(struct pb_rq *pb_rq, unsigned int i, u64 n_instr, u64 task_id, struct task_struct *task_struct)
{
pb_rq->plan[i].n_instr = n_instr;
pb_rq->plan[i].task_id = task_id;
pb_rq->plan[i].task_struct = task_struct;
}
EXPORT_SYMBOL(set_pb_plan_entry);
// called by core.c sched_init
void init_pb_rq(struct pb_rq *pb_rq)
{
pb_rq->n_pb_cycles = 100;
pb_rq->count_pb_cycles = 0;
pb_rq->n_admin_cycles = 20;
pb_rq->count_admin_cycles = 0;
pb_rq->mode = PB_DISABLED_MODE;
pb_rq->c_entry = 0;
pb_rq->size = 0;
pb_rq->pevent = NULL;
pb_rq->is_initialized = 0;
pb_rq->waiting_on_io = 0;
}
EXPORT_SYMBOL(init_pb_rq);
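/*
* Note on the fields initialized above (inferred from their use in
* task_tick_pb() and from determine_next_mode_pb(), which is defined
* elsewhere): the scheduler appears to alternate between plan execution
* (PB_EXEC_MODE, budgeted by n_pb_cycles ticks) and an administrative window
* (PB_ADMIN_MODE, budgeted by n_admin_cycles ticks), with the count_* fields
* tracking the ticks spent in the current mode.
*/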
// IO has finished, we can schedule the next task
static void enqueue_task_pb(struct rq *rq, struct task_struct *p, int flags)
{
struct pb_rq *pb = &(rq->pb);
pb->waiting_on_io = 0;
}
// the task has started IO; treat the current plan entry as finished
static void dequeue_task_pb(struct rq *rq, struct task_struct *p, int flags)
{
struct pb_rq *pb = &(rq->pb);
unsigned int c_entry_curr;
u64 perf_counter;
u64 counter_diff;
u64 read_error;
bool premature_finish = false;
if (pb->waiting_on_io) {
return;
}
pb->waiting_on_io = 1;
c_entry_curr = pb->c_entry;
if (!pb->pevent) {
printk(KERN_WARNING "PERF EVENT IS NULL\n");
}
// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
read_error = get_perf_counter(pb->pevent, &perf_counter);
if (read_error) {
printk(KERN_WARNING "FETCHING PERFORMANCE COUNTER IN PB SCHEDULER FAILED WITH %llu\n", read_error);
}
counter_diff = perf_counter - pb->total_instr;
pb->plan[c_entry_curr].n_instr_counted = counter_diff;
pb->total_instr = perf_counter;
if (counter_diff < pb->plan[c_entry_curr].n_instr) {
u64 under_time = pb->plan[c_entry_curr].n_instr - counter_diff;
printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO SHORT\n", pb->plan[pb->c_entry].task_id, under_time);
} else if (counter_diff > pb->plan[c_entry_curr].n_instr) {
//TODO: Check whether an overflow actually occurred and whether another calculation is necessary
// (setting a flag in the perf overflow_handler could be a solution)
u64 over_time = counter_diff - pb->plan[c_entry_curr].n_instr;
printk(KERN_WARNING "PB TASK %llu RAN %llu INSTRUCTIONS TOO LONG\n", pb->plan[pb->c_entry].task_id, over_time);
}
pb->c_entry++;
/**
* Don't schedule a task that is dead (e.g. the plan was incorrect and the
* program finished earlier than expected).
* TODO: if we have multiple task structs, try the next plan entry
*/
if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
premature_finish = true;
}
if (pb->c_entry >= pb->size || premature_finish) {
if (premature_finish) {
printk(KERN_WARNING "PLAN TERMINATED PREMATURELY\n");
} else {
printk(KERN_WARNING "PLAN DONE\n");
}
// set back to cfs for completion of task
pb->is_initialized = 0;
pb->plan[0].task_struct->sched_class = &fair_sched_class;
resched_curr(rq);
}
}
static void yield_task_pb(struct rq *rq)
{
// NOP
}
static void check_preempt_curr_pb(struct rq *rq, struct task_struct *p, int flags)
{
// NOP
}
static struct task_struct * pick_next_task_pb(struct rq *rq,
struct task_struct *prev, struct rq_flags *rf)
{
// contains task to be executed
struct task_struct *picked = NULL;
enum pb_mode current_mode, next_mode;
struct pb_rq *pb = &(rq->pb);
current_mode = pb->mode;
next_mode = determine_next_mode_pb(rq);
pb->mode = next_mode;
if (next_mode == PB_DISABLED_MODE && current_mode == PB_EXEC_MODE) {
// After the plan is done, do the cleanup
terminate_perf_event(pb->pevent);
pb->pevent = NULL;
// TODO: Check if we have to free the memory or if perf takes care of it
// see 'perf_event_release_kernel(struct perf_event *event)' in core.c
}
/**
* This handles the case where the program to be run is dead before the
* pb scheduler starts executing
*/
if (current_mode == PB_DISABLED_MODE && current_mode != next_mode) {
if (pb->c_entry < pb->size && pb->plan[pb->c_entry].task_struct->state == TASK_DEAD) {
pb->mode = PB_DISABLED_MODE;
next_mode = PB_DISABLED_MODE;
picked = NULL;
pb->is_initialized = 0;
printk(KERN_WARNING "PLAN TERMINATED PREMATURELY \n");
}
}
if (current_mode != next_mode) {
printk("SWITCHING MODES\n");
pb->count_admin_cycles = 0;
pb->count_pb_cycles = 0;
// Push last non-plan task back in its corresponding runqueue
if (next_mode == PB_EXEC_MODE) {
// Necessary to manage the preempted task
printk("PUT OLD TASK BACK IN RQ\n");
put_prev_task(rq, prev);
}
}
// EXEC Mode is next, so we return our next task to be executed
if (next_mode == PB_EXEC_MODE) {
// printk(KERN_ALERT "DEBUG: Passed %s %d \n",__FUNCTION__,__LINE__);
if(current_mode == PB_ADMIN_MODE) {
printk(KERN_DEBUG "PB ADMIN,STOP,%u,%llu\n", pb->c_entry, sched_clock());
} else if (current_mode == PB_DISABLED_MODE) {
printk("Switching from disabled to EXEC\n");
}
picked = pb->plan[pb->c_entry].task_struct;
}
return picked;
}
static void put_prev_task_pb(struct rq *rq, struct task_struct *p)
{
// NOP
}
static void set_curr_task_pb(struct rq *rq)
{
// NOP
}
/*
* TODO: Make sure this doesn't interrupt determine_next_mode_pb() and pick_next_task_pb()
*/
static void task_tick_pb(struct rq *rq, struct task_struct *p, int queued)
{
struct pb_rq *pb = &(rq->pb);
if (pb->mode != PB_EXEC_MODE) {
return;
}
pb->count_pb_cycles++;
// printk("TICK #%d\n",pb->count_pb_cycles);
if (determine_next_mode_pb(rq) != PB_EXEC_MODE && pb->mode == PB_EXEC_MODE) {
//printk("Reschudling in task_tick_pb");
resched_curr(rq);
}
}
static unsigned int get_rr_interval_pb(struct rq *rq, struct task_struct *task)
{
return 0;
}
static void prio_changed_pb(struct rq *rq, struct task_struct *p, int oldprio)
{
// NOP
}
static void switched_to_pb(struct rq *rq, struct task_struct *p)
{
// NOP
}
static void update_curr_pb(struct rq *rq)
{
// NOP
}
const struct sched_class pb_sched_class = {
.next = &dl_sched_class,
.enqueue_task = enqueue_task_pb,
.dequeue_task = dequeue_task_pb,
.yield_task = yield_task_pb,
.check_preempt_curr = check_preempt_curr_pb, // NOP
.pick_next_task = pick_next_task_pb,
.put_prev_task = put_prev_task_pb, // NOP
.set_curr_task = set_curr_task_pb, // NOP
.task_tick = task_tick_pb,
.get_rr_interval = get_rr_interval_pb, // NOP (return 0)
.prio_changed = prio_changed_pb, // NOP
.switched_to = switched_to_pb, // NOP
.update_curr = update_curr_pb, // NOP
};
EXPORT_SYMBOL(pb_sched_class);
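/*
* The pb class chains to the deadline class via .next, i.e. in the patched
* scheduler class list pb is ordered directly above dl (assuming the core
* walks the classes through the usual .next links). Most callbacks are NOPs;
* the scheduling decisions happen in pick_next_task_pb(), task_tick_pb() and
* dequeue_task_pb().
*/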
/***********************************************************************
* /proc filesystem entry
* use 'cat /proc/pbsched' to read
**********************************************************************/
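/*
* Example output (illustrative values only):
*
*   cpuid mode curr_entry curr_pb_cycles curr_admin_cycles
*   cpu0 E 2 37 0
*
*   task_id n_instr n_instr_counted
*   0 1000000 1000173
*   1 2000000 queued
*
* The seq_printf() format strings below are the authoritative definition.
*/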
static int show_pbsched(struct seq_file *seq, void *v)
{
int cpu;
if (v == (void *)1) {
seq_printf(seq, "cpuid mode curr_entry curr_pb_cycles curr_admin_cycles\n");
} else {
char mode;
struct rq *rq;
struct pb_rq *pb;
int i;
struct plan_entry *plan;
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
pb = &(rq->pb);
switch(pb->mode) {
case PB_DISABLED_MODE: mode='D'; break;
case PB_EXEC_MODE: mode='E'; break;
case PB_ADMIN_MODE: mode='A'; break;
default: mode='U'; break;
}
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %c %u %llu %llu\n",
cpu,
mode,
pb->c_entry,
pb->count_pb_cycles,
pb->count_admin_cycles
);
/* plan stats */
if(pb->size){
seq_printf(seq, "\ntask_id n_instr n_instr_counted\n");
plan = pb->plan;
for (i=0; i < pb->size; i++){
// only print completed entries; once the plan has finished, is_initialized is 0 and the last entry can be printed as well
if(i<pb->c_entry || !pb->is_initialized){
seq_printf(seq,
"%llu %llu %llu\n",
plan[i].task_id,
plan[i].n_instr,
plan[i].n_instr_counted
);
}else{
seq_printf(seq,
"%llu %llu queued\n",
plan[i].task_id,
plan[i].n_instr
);
}
}
}
}
return 0;
}
/*
* This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some cpus, including cpu 0, may be missing so we have
* to use cpumask_* to iterate over the cpus.
*/
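/*
* In short: an *offset of 0 yields the header token (void *)1; the following
* positions walk cpu_online_mask and encode cpu N as (void *)(N + 2), which
* show_pbsched() decodes again by subtracting 2.
*/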
static void *pbsched_start(struct seq_file *file, loff_t *offset)
{
unsigned long n = *offset;
if (n == 0)
return (void *) 1;
n--;
if (n > 0)
n = cpumask_next(n - 1, cpu_online_mask);
else
n = cpumask_first(cpu_online_mask);
*offset = n + 1;
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
return NULL;
}
static void *pbsched_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
return pbsched_start(file, offset);
}
static void pbsched_stop(struct seq_file *file, void *data)
{
// NOP
}
static const struct seq_operations pbsched_sops = {
.start = pbsched_start,
.next = pbsched_next,
.stop = pbsched_stop,
.show = show_pbsched,
};
static int pbsched_open(struct inode *inode, struct file *file)
{
return seq_open(file, &pbsched_sops);
}
static const struct file_operations proc_pbsched_operations = {
.open = pbsched_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init proc_pbsched_init(void)
{
proc_create("pbsched", 0, NULL, &proc_pbsched_operations);
return 0;
}
subsys_initcall(proc_pbsched_init);