Skip to content
Snippets Groups Projects
Commit e98b464a authored by alexander06's avatar alexander06
Browse files

Remove redundant clean in main pipeline

parent 8ac2d2f9
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Evaluation Report Pipeline # Evaluation Report Pipeline
This file hosts the pipeline, which generates all images containing the final evaluation results This file hosts the pipeline, which generates all images containing the final evaluation results
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import os, pandas, math,json import os, pandas, math,json
import regex as re import regex as re
import lib.constants as CONSTANTS import lib.constants as CONSTANTS
import lib.api as api import lib.api as api
# Cache filled by calc_score: question column name -> the `intervals` entry of that question's scale.
question_cache = {} question_cache = {}
# Rebound by generate_intervals: scale id -> list of the 'value' entries of that scale's answers.
intervals = {} intervals = {}
# Lowercase-hex UUID pattern (8-4-4-4-12). It is used with .match(), so a name must START with a UUID to count as a question column.
uuid_regex = re.compile('\\b[0-9a-f]{8}\\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\\b[0-9a-f]{12}\\b') uuid_regex = re.compile('\\b[0-9a-f]{8}\\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\\b[0-9a-f]{12}\\b')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def file_iterator(data_path=None):
    """Yield every ``all.csv`` found below the clean-data directory.

    Args:
        data_path: Directory to walk. Defaults to
            ``CONSTANTS.CLEAN_DATA_PATH`` when omitted.

    Yields:
        A three-element list: the last two path components of the directory
        that contains ``all.csv``, followed by that file loaded as a
        ``pandas.DataFrame``.
    """
    if data_path is None:
        data_path = CONSTANTS.CLEAN_DATA_PATH
    for root, _dirs, files in os.walk(data_path):
        if "all.csv" not in files:
            continue
        frame = pandas.read_csv(os.path.join(root, "all.csv"))
        # os.walk joins directories with the platform separator; normalise it
        # so the split also works where that separator is not "/".
        yield root.replace(os.sep, "/").split("/")[-2:] + [frame]
def generate_scales():
    """Collect the answers of every scale offered by the API.

    Returns:
        A dict keyed by scale id whose values are whatever
        ``api.get_answers_for`` returns for that id.
    """
    return {
        entry["id"]: api.get_answers_for(entry["id"])
        for entry in api.get_scales()
    }
def generate_intervals():
    """Rebuild the module-level ``intervals`` lookup.

    For every scale returned by ``generate_scales`` the ``'value'`` entry of
    each answer is extracted, giving a dict of scale id -> list of values.
    The global is rebound only after the whole mapping has been built.
    """
    global intervals
    extracted = {}
    for scale_id, answers in generate_scales().items():
        extracted[scale_id] = [answer['value'] for answer in answers]
    intervals = extracted
def get_factors():
    """Derive one factor per scale from the module-level ``intervals``.

    The scales "scale_time" and "y_n" are skipped. Every other entry is
    unpacked into exactly two values ``(a, b)`` and its factor is
    ``round(abs(a - b - 1) / 2, 0)``.

    NOTE(review): the unpacking raises ``ValueError`` for any scale that does
    not hold exactly two values -- confirm that this is intended.

    Returns:
        A dict mapping scale id to its factor.
    """
    skipped = ("scale_time", "y_n")
    factors = {}
    for scale_id, bounds in intervals.items():
        # These scales either just have two steps or their answers have no weight
        if scale_id in skipped:
            continue
        first, second = bounds
        factors[scale_id] = round(abs(first - second - 1) / 2, 0)
    return factors
# Matches a single digit 0-9, optionally followed by ".0" (the way a
# whole-number float is rendered by str()). Compiled once, not on every call.
_SCORE_PATTERN = re.compile(r'^\d(\.0)?$')


# The type of the value is unknown
def convert_score(value, factor) -> int:
    """Translate one raw answer into the weight stored for it in ``factor``.

    Args:
        value: Raw cell content of unknown type (number, string, NaN, ...).
        factor: Sequence of weights; answer ``k`` (1-based) maps to
            ``factor[k - 1]``.

    Returns:
        The weight of the answer, or 0 when ``value`` is not a single-digit
        number or does not address an existing position in ``factor``.
    """
    # Minimum requirement for a number.
    # The cast to string safeguards real floats.
    if _SCORE_PATTERN.match(str(value)) is None:
        return 0
    index = int(float(value)) - 1
    # Answer 0 would otherwise wrap around to the last weight (index -1) and
    # an answer beyond the scale would raise IndexError; both score nothing.
    if not 0 <= index < len(factor):
        return 0
    return factor[index]
def calc_score(series: pandas.Series) -> float:
    """Sum the weighted answers of one question column.

    Columns whose name does not start with a UUID are not treated as
    questions; for those the function returns ``None``.

    Args:
        series: One column of a response frame; ``series.name`` is the
            column name.

    Returns:
        0 for questions whose weights are all zero, the number of "Ja"
        answers for ``[1, 0]`` weighted questions, otherwise the sum of the
        converted answers.
    """
    if uuid_regex.match(series.name) is None:
        return None
    # TODO: Negate that one question
    question = series.name
    if question not in question_cache:
        scale = api.get_scale_for(question)
        question_cache[question] = intervals[scale["id"]]
    weights = question_cache[question]
    # Catches weightless questions
    if all(weight == 0 for weight in weights):
        return 0
    if weights == [1, 0]:
        yes_flags = series.map(lambda answer: 1 if answer == "Ja" else 0)
        return yes_flags.sum()
    # Pandas saves values as float, so they need to be converted
    converted = series.map(lambda answer: convert_score(answer, weights), na_action="ignore")
    return converted.sum()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Fills the module-level `intervals` and loads one participation table per semester file.
# NOTE(review): every line below shows both sides of a side-by-side diff; where the two copies differ, confirm which side is current.
# Generate static values # Generate static values
generate_intervals() generate_intervals()
# Maps semester name (with "-" replaced by "/") -> DataFrame read from that semester's CSV.
participation = {} participation = {}
# NOTE(review): the two copies differ here -- one reads from CONSTANTS.RAW_DATA_PATH, the other from CONSTANTS.CLEAN_DATA_PATH.
participation_path = os.path.join(CONSTANTS.RAW_DATA_PATH, "participation") participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
for file in os.listdir(participation_path): for file in os.listdir(participation_path):
# Only CSV files whose name does not start with "example" are loaded.
if not file.endswith(".csv") or file.startswith("example"): continue if not file.endswith(".csv") or file.startswith("example"): continue
semester = file.replace(".csv", "") semester = file.replace(".csv", "")
# NOTE(review): the two copies differ here -- one parses with sep=";", the other with the pandas default separator.
participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file), sep=";") participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file))
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Calculate weight for Dimension ## Calculate weight for Dimension
Calculates the individual score for all dimensions for every course Calculates the individual score for all dimensions for every course
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Builds one result frame per file yielded by file_iterator(): one row per
# course (grouped by number, lecturer, category and name) that carries the
# course metadata followed by the summed score of every question column.
result_frames = {}
for (institute, semester, df) in file_iterator():
    rows = []
    grouped = df.groupby(["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Kategorie", "Veranstaltung-Name"])
    for key, responses in grouped:
        # Keeps the at least five responses constraint
        if len(responses) < 5:
            continue
        head = pandas.Series({
            "Veranstaltung-Nr.": key[0],
            "Person-Name": key[1],
            "Veranstaltung-Typ": key[2],
            "Veranstaltung-Name": key[3],
            "Antworten": len(responses),
        })
        # Columns without a single answer are dropped before scoring; entries
        # for which calc_score produced no value are dropped afterwards.
        result = responses.dropna(axis="columns", how="all").apply(calc_score).dropna(how="all")
        combined = pandas.concat([head, result])
        rows.append(combined.to_frame().T)
    # Concatenate once per file: growing the frame inside the loop copied
    # every previously collected row again on each iteration.
    res_df = pandas.concat(rows) if rows else pandas.DataFrame(data={})
    result_frames["{0}_{1}".format(institute, semester)] = res_df
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#print(result_frames)
def norm_name(name: str) -> str:
    """Turn a "Last, First" name into "First Last".

    Names that do not contain ", " are returned unchanged. Only the first two
    comma-separated parts are used; any further parts are discarded.
    """
    parts = name.split(", ")
    if len(parts) < 2:
        return name
    last, first = parts[0], parts[1]
    return f"{first} {last}"
# Turns every result frame into a list of course dicts and passes that list to api.force_courses.
# NOTE(review): lines appear twice because both sides of a side-by-side diff are shown; the two
# `names` / `norm_name` lines below appear only once, i.e. they exist on one side of the diff only.
for (institute_semester, df) in result_frames.items(): for (institute_semester, df) in result_frames.items():
collection = [] collection = []
# The key is unpacked into exactly two parts, so it must contain exactly one "_".
institute, semester = institute_semester.split("_") institute, semester = institute_semester.split("_")
institute = CONSTANTS.INSTITUTE_MAP[institute] institute = CONSTANTS.INSTITUTE_MAP[institute]
# Same "-" -> "/" replacement that is applied to the keys of `participation`.
if "WiSe" in semester: if "WiSe" in semester:
semester = semester.replace("-", "/") semester = semester.replace("-", "/")
if semester in participation: if semester in participation:
# This may lead to data loss but is impossible to work around # This may lead to data loss but is impossible to work around
semester_obj = participation[semester] semester_obj = participation[semester]
semester_obj.drop_duplicates(inplace=True) semester_obj.drop_duplicates(inplace=True)
# Rewrites the names with norm_name before they are used as a merge key (present on one diff side only).
names = semester_obj["Person-Name"]
semester_obj["Person-Name"] = names.map(norm_name)
# Left merge: every course row is kept, whether or not a matching participation row exists.
df = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"], how="left") df = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"], how="left")
# iterrows yields (index, row) tuples, hence row[1] is the row itself.
for row in df.iterrows(): for row in df.iterrows():
course_object = { course_object = {
"course_number": str(row[1]["Veranstaltung-Nr."]), "course_number": str(row[1]["Veranstaltung-Nr."]),
"course_name": row[1]["Veranstaltung-Name"], "course_name": row[1]["Veranstaltung-Name"],
"course_type": row[1]["Veranstaltung-Typ"], "course_type": row[1]["Veranstaltung-Typ"],
"lecturer": row[1]["Person-Name"], "lecturer": row[1]["Person-Name"],
"semester": semester, "semester": semester,
"institute": institute, "institute": institute,
"answers": row[1]["Antworten"], "answers": row[1]["Antworten"],
# -1 indicated that no data is available # -1 indicated that no data is available
"participants": -1, "participants": -1,
"scores": [] "scores": []
} }
# The column is only looked up when it exists in df; NaN entries keep the -1 placeholder.
if "Anzahl Teilnehmende" in df.columns: if "Anzahl Teilnehmende" in df.columns:
if not math.isnan(row[1]["Anzahl Teilnehmende"]): if not math.isnan(row[1]["Anzahl Teilnehmende"]):
course_object["participants"] = int(row[1]["Anzahl Teilnehmende"]) course_object["participants"] = int(row[1]["Anzahl Teilnehmende"])
# Only columns whose name starts with a UUID are exported as question scores.
for col in row[1].index: for col in row[1].index:
if uuid_regex.match(col) != None: if uuid_regex.match(col) != None:
# Early Break for irrelevant columns # Early Break for irrelevant columns
if math.isnan(row[1][col]): if math.isnan(row[1][col]):
continue continue
course_object["scores"].append({"question": col, "score": int(row[1][col])}) course_object["scores"].append({"question": col, "score": int(row[1][col])})
collection.append(course_object) collection.append(course_object)
# NOTE(review): indentation is lost in this view; since `collection` is reset for every frame, this call presumably runs once per frame -- confirm.
api.force_courses(collection) api.force_courses(collection)
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment