Skip to content
Snippets Groups Projects
Commit e98b464a authored by alexander06's avatar alexander06
Browse files

Remove redundant clean in main pipeline

parent 8ac2d2f9
Branches
Tags
No related merge requests found
%% Cell type:markdown id: tags:
# Evaluation Report Pipeline
This file hosts the pipeline that generates all images containing the final evaluation results.
%% Cell type:code id: tags:
``` python
import os, pandas, math,json
import regex as re
import lib.constants as CONSTANTS
import lib.api as api
# Memoizes the per-question lookup done in calc_score, keyed by the question's column name
question_cache = {}
# Filled by generate_intervals(): maps a scale id to the list of its answers' 'value' entries
intervals = {}
# Lowercase-hex UUID pattern (8-4-4-4-12); applied with .match(), so it only has to match at the start of a column name
uuid_regex = re.compile('\\b[0-9a-f]{8}\\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\\b[0-9a-f]{12}\\b')
```
%% Cell type:code id: tags:
``` python
def file_iterator():
    """Yield ``[parent_dir, dir, dataframe]`` for every ``all.csv`` that is found.

    Walks ``CONSTANTS.CLEAN_DATA_PATH`` recursively. For each directory that
    contains a file named ``all.csv`` the file is loaded with pandas and
    yielded as the third element of a list whose first two elements are the
    last two components of the directory path (the caller unpacks them as
    institute and semester).
    """
    for (root, _dirs, files) in os.walk(CONSTANTS.CLEAN_DATA_PATH):
        # The membership test already covers the empty-directory case
        if 'all.csv' not in files:
            continue
        file_path = os.path.join(root, "all.csv")
        df = pandas.read_csv(file_path)
        # os.walk joins paths with the platform separator; normalise it first
        # so the split also works where os.sep is not "/"
        yield root.replace(os.sep, "/").split("/")[-2:] + [df]
def generate_scales():
    """Return a dict that maps every scale id to the answers fetched for it.

    The scales come from ``api.get_scales()``; the answers for each scale are
    requested individually through ``api.get_answers_for``.
    """
    return {
        entry["id"]: api.get_answers_for(entry["id"])
        for entry in api.get_scales()
    }
def generate_intervals():
    """Rebuild the module-level ``intervals`` table.

    For every scale returned by ``generate_scales()`` the ``'value'`` entry of
    each answer is collected into a list, keyed by the scale id.
    """
    global intervals
    rebuilt = {}
    for scale_id, answers in generate_scales().items():
        rebuilt[scale_id] = [answer['value'] for answer in answers]
    # Assign only after everything was collected, so a failure leaves the old table intact
    intervals = rebuilt
def get_factors():
    """Return a dict mapping interval names to ``round(abs(a - b - 1) / 2, 0)``.

    ``a`` and ``b`` are the two entries stored for the interval in the
    module-level ``intervals`` table; an interval with any other number of
    entries makes the unpacking raise ``ValueError``.
    """
    # These intervals either just have two steps or the answers have no weight
    skipped = ["scale_time", "y_n"]
    factor = {}
    for name, bounds in intervals.items():
        if name in skipped:
            continue
        (first, second) = bounds
        factor[name] = round(abs(first - second - 1) / 2, 0)
    return factor
# Matches a single digit 0-9 with an optional trailing ".0".
# Compiled once here instead of on every call.
_NUMBER_REGEX = re.compile(r'^\d(\.0)?$')


# The type of the value is unknown
def convert_score(value, factor) -> int:
    """Translate a 1-based answer value into its weight.

    Args:
        value: The raw answer; may be an int, a float such as ``3.0``, a
            string, or anything else that ended up in the cell.
        factor: Sequence of weights; answer ``n`` maps to ``factor[n - 1]``.

    Returns:
        The weight for the answer, or 0 when the value is not a single digit
        (optionally followed by ``.0``) or lies outside the scale.
    """
    # Minimum requirement for a number
    # Cast to string safeguards real floats
    if _NUMBER_REGEX.match(str(value)) is None:
        return 0
    index = int(float(value)) - 1
    # Reject 0 (which would wrap around to the last weight via index -1)
    # and digits beyond the scale (which would raise IndexError)
    if 0 <= index < len(factor):
        return factor[index]
    return 0
def calc_score(series: pandas.Series) -> float:
    """Sum up the weighted answers of one question column.

    Columns whose name does not start with a UUID are not questions and yield
    ``None``. For question columns the weights are looked up once through
    ``api.get_scale_for`` and the ``intervals`` table, then kept in
    ``question_cache``.
    """
    column = series.name
    if uuid_regex.match(column) is None:
        return None
    # TODO: Negate that one question
    if column not in question_cache:
        question_cache[column] = intervals[api.get_scale_for(column)["id"]]
    weights = question_cache[column]
    # Catches weightless questions
    if not any(weight != 0 for weight in weights):
        return 0
    if weights == [1, 0]:
        # Two-step scale: count the "Ja" answers
        return series.map(lambda answer: 1 if answer == "Ja" else 0).sum()
    # Pandas saves values as float, so they need to be converted
    weighted = series.map(lambda answer: convert_score(answer, weights), na_action="ignore")
    return weighted.sum()
```
%% Cell type:code id: tags:
``` python
# Generate static values
generate_intervals()

# Participation tables, keyed by the file name without ".csv" and with "-" replaced by "/"
participation = {}
# Only the clean data directory is used; the earlier assignment to the raw
# directory was overwritten before it was ever read
participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
for file in os.listdir(participation_path):
    if not file.endswith(".csv") or file.startswith("example"):
        continue
    # Strip only the trailing extension, not every ".csv" inside the name
    semester = file[:-len(".csv")]
    # Each file is read once with the default separator; the former extra read
    # with sep=";" was discarded immediately
    participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file))
```
%% Cell type:markdown id: tags:
## Calculate weight for Dimension
Calculates the individual score of every dimension for each course.
%% Cell type:code id: tags:
``` python
# One result table per "<institute>_<semester>", holding one row per course/lecturer group
result_frames = {}
group_keys = ["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Kategorie", "Veranstaltung-Name"]
for (institute, semester, df) in file_iterator():
    course_rows = []
    for (key, responses) in df.groupby(group_keys):
        # Keeps the at least five responses constraint
        if len(responses) < 5:
            continue
        head = pandas.Series({"Veranstaltung-Nr.": key[0], "Person-Name": key[1], "Veranstaltung-Typ": key[2], "Veranstaltung-Name": key[3], "Antworten": len(responses)})
        # Score every column that still has data; calc_score yields nothing
        # for non-question columns, which the second dropna removes again
        result = responses.dropna(axis="columns", how="all").apply(calc_score).dropna(how="all")
        combined = pandas.concat([head, result])
        course_rows.append(combined.to_frame().T)
    # Concatenate once per file instead of growing the frame row by row,
    # which copies the accumulated data on every iteration
    res_df = pandas.concat(course_rows) if course_rows else pandas.DataFrame(data={})
    result_frames["{0}_{1}".format(institute, semester)] = res_df
```
%% Cell type:code id: tags:
``` python
#print(result_frames)
def norm_name(name: str) -> str:
    """Turn a "Last, First" name into "First Last".

    Names without ", " are returned unchanged. Values that are not strings
    (for example the NaN of a missing cell) are passed through untouched
    instead of raising ``TypeError``. Only the first two comma-separated
    parts are used; anything after a second ", " is dropped.
    """
    if not isinstance(name, str) or ', ' not in name:
        return name
    parts = name.split(", ")
    return parts[1] + " " + parts[0]
# Build one course object per result row and push them to the API
for institute_semester, df in result_frames.items():
    collection = []
    institute, semester = institute_semester.split("_")
    institute = CONSTANTS.INSTITUTE_MAP[institute]
    if "WiSe" in semester:
        semester = semester.replace("-", "/")

    if semester in participation:
        # This may lead to data loss but is impossible to work around
        semester_obj = participation[semester]
        semester_obj.drop_duplicates(inplace=True)
        # Bring lecturer names into the same form the result frame uses
        semester_obj["Person-Name"] = semester_obj["Person-Name"].map(norm_name)
        participant_columns = semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']]
        df = df.merge(participant_columns, on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"], how="left")

    # The column only exists when participation data was merged in above
    has_participants = "Anzahl Teilnehmende" in df.columns
    for _, course in df.iterrows():
        course_object = {
            "course_number": str(course["Veranstaltung-Nr."]),
            "course_name": course["Veranstaltung-Name"],
            "course_type": course["Veranstaltung-Typ"],
            "lecturer": course["Person-Name"],
            "semester": semester,
            "institute": institute,
            "answers": course["Antworten"],
            # -1 indicates that no data is available
            "participants": -1,
            "scores": []
        }
        if has_participants and not math.isnan(course["Anzahl Teilnehmende"]):
            course_object["participants"] = int(course["Anzahl Teilnehmende"])
        for col in course.index:
            # Only UUID-named columns are questions
            if uuid_regex.match(col) is None:
                continue
            # Early skip for questions this course has no score for
            if math.isnan(course[col]):
                continue
            course_object["scores"].append({"question": col, "score": int(course[col])})
        collection.append(course_object)
    # Sent once per institute/semester, since the collection is rebuilt on every iteration
    api.force_courses(collection)
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment