Skip to content
Snippets Groups Projects
Commit e98b464a authored by alexander06's avatar alexander06
Browse files

Remove redundant clean in main pipeline

parent 8ac2d2f9
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Evaluation Report Pipeline
This file hosts the pipeline that generates all images containing the final evaluation results.
%% Cell type:code id: tags:
``` python
import os, pandas, math,json
import regex as re
import lib.constants as CONSTANTS
import lib.api as api
# Per-question cache filled lazily by calc_score: column name (question id) ->
# the entry of `intervals` belonging to that question's scale.
question_cache = {}
# Scale id -> list of answer values; populated by generate_intervals().
intervals = {}
# Matches lowercase hexadecimal UUIDs in 8-4-4-4-12 form. Used to tell question
# columns apart from the other columns of a result table.
uuid_regex = re.compile('\\b[0-9a-f]{8}\\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\\b[0-9a-f]{12}\\b')
```
%% Cell type:code id: tags:
``` python
def file_iterator():
    """Yield ``[parent_dir, dir, dataframe]`` for every ``all.csv`` found.

    Walks ``CONSTANTS.CLEAN_DATA_PATH`` recursively. For each directory that
    contains a file named ``all.csv`` the file is read with pandas and yielded
    together with the last two components of the directory path. The caller in
    this file unpacks the triple as ``(institute, semester, df)``.
    """
    for root, _dirs, files in os.walk(CONSTANTS.CLEAN_DATA_PATH):
        if "all.csv" not in files:
            continue
        df = pandas.read_csv(os.path.join(root, "all.csv"))
        # os.walk joins paths with the platform separator, so split on os.sep
        # (after normalising) instead of a hard-coded "/".
        yield os.path.normpath(root).split(os.sep)[-2:] + [df]
def generate_scales():
    """Return a dict mapping each scale's ``id`` to the answers fetched for it.

    Scales come from ``api.get_scales()``; the answers of each scale come from
    ``api.get_answers_for(<scale id>)``.
    """
    return {
        scale["id"]: api.get_answers_for(scale["id"])
        for scale in api.get_scales()
    }
def generate_intervals():
    """Rebuild the module-level ``intervals`` mapping.

    For every scale returned by ``generate_scales()`` the ``'value'`` field of
    each answer is collected into a list, keyed by the scale id.
    """
    global intervals
    collected = {}
    for scale_id, answers in generate_scales().items():
        collected[scale_id] = [answer['value'] for answer in answers]
    intervals = collected
def get_factors():
    """Return a dict of scale id -> factor derived from ``intervals``.

    The scales ``"scale_time"`` and ``"y_n"`` are left out. Every other entry
    of ``intervals`` is unpacked into exactly two values ``a`` and ``b`` (any
    other length raises ``ValueError``), and the factor is
    ``round(abs(a - b - 1) / 2, 0)``.
    """
    # These scales either just have two steps or their answers carry no weight
    unweighted = ["scale_time", "y_n"]
    factor = {}
    for scale_id, bounds in intervals.items():
        if scale_id in unweighted:
            continue
        a, b = bounds
        factor[scale_id] = round(abs(a - b - 1) / 2, 0)
    return factor
# Matches a single digit 0-9, optionally followed by ".0" (the form a
# whole-number float takes once cast to str). Compiled once, not per call.
_NUMBER_REGEX = re.compile(r'^\d(\.0)?$')


# The type of the value is unknown
def convert_score(value, factor) -> int:
    """Translate a raw answer into its weight.

    Args:
        value: The answer as read from the data; its type is not known in
            advance, so it is cast to ``str`` before being checked.
        factor: Sequence of weights; answer ``k`` selects ``factor[k - 1]``.

    Returns:
        The weight for ``value``. ``0`` is returned when ``value`` is not a
        single-digit whole number or when it lies outside the range of
        ``factor`` (including ``0``, which would otherwise wrap around to the
        last weight through negative indexing).
    """
    # Cast to string safeguards real floats
    if _NUMBER_REGEX.match(str(value)) is None:
        return 0
    index = int(float(value)) - 1
    # Reject answers outside the scale instead of wrapping or raising.
    if not 0 <= index < len(factor):
        return 0
    return factor[index]
def calc_score(series: pandas.Series) -> float:
    """Sum the weighted answers of one question column.

    Columns whose name does not match ``uuid_regex`` yield ``None``. For
    matching columns the weight list is looked up in ``question_cache`` (and
    fetched through ``api.get_scale_for`` / ``intervals`` on a cache miss).

    Returns:
        ``0`` if every weight is zero; the number of ``"Ja"`` answers if the
        weights are exactly ``[1, 0]``; otherwise the sum of
        ``convert_score`` over all non-missing answers.
    """
    if uuid_regex.match(series.name) is None:
        return None

    # TODO: Negate that one question
    question_id = series.name
    if question_id not in question_cache:
        scale = api.get_scale_for(question_id)
        question_cache[question_id] = intervals[scale["id"]]
    factor = question_cache[question_id]

    # Catches weightless questions
    if not any(weight != 0 for weight in factor):
        return 0

    if factor == [1, 0]:
        yes_flags = series.map(lambda answer: 1 if answer == "Ja" else 0)
        return yes_flags.sum()

    # Pandas saves values as float, so they need to be converted
    weighted = series.map(lambda answer: convert_score(answer, factor), na_action="ignore")
    return weighted.sum()
```
%% Cell type:code id: tags:
``` python
# Generate static values
generate_intervals()

# Semester label -> participation table. The label is the file name without
# ".csv" and with "-" replaced by "/".
participation = {}
# Only the cleaned participation data is used. The earlier RAW_DATA_PATH
# assignment was overwritten immediately, and the additional read with
# sep=";" was discarded by the read that followed it.
participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
for file in os.listdir(participation_path):
    # Skip non-CSV files and the example template(s)
    if not file.endswith(".csv") or file.startswith("example"):
        continue
    semester = file.replace(".csv", "")
    participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file))
```
%% Cell type:markdown id: tags:
## Calculate weight for Dimension
Calculates the individual score for all dimensions for every course
%% Cell type:code id: tags:
``` python
# "<institute>_<semester>" -> frame with one row per course: the descriptive
# head columns followed by one score column per question.
result_frames = {}
for institute, semester, df in file_iterator():
    grouped = df.groupby(["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Kategorie", "Veranstaltung-Name"])
    # Collect the per-course rows and concatenate once at the end; calling
    # pandas.concat inside the loop recopies the accumulated frame every time.
    course_rows = []
    for (course_number, lecturer, category, course_name), responses in grouped:
        # Keeps the at least five responses constraint
        if len(responses) < 5:
            continue
        head = pandas.Series({"Veranstaltung-Nr.": course_number, "Person-Name": lecturer, "Veranstaltung-Typ": category, "Veranstaltung-Name": course_name, "Antworten": len(responses)})
        # calc_score yields None for non-question columns; dropna removes them
        result = responses.dropna(axis="columns", how="all").apply(calc_score).dropna(how="all")
        course_rows.append(pandas.concat([head, result]).to_frame().T)
    # pandas.concat raises on an empty list, so fall back to an empty frame
    res_df = pandas.concat(course_rows) if course_rows else pandas.DataFrame(data={})
    result_frames[f"{institute}_{semester}"] = res_df
```
%% Cell type:code id: tags:
``` python
#print(result_frames)
def norm_name(name: str) -> str:
    """Reorder a ``"Last, First"`` name to ``"First Last"``.

    A name without ``", "`` is returned unchanged. If the separator occurs
    more than once, only the first two parts are used.
    """
    if ', ' in name:
        last, first, *_ = name.split(", ")
        return f"{first} {last}"
    return name
# Turn every result frame into a list of course objects and hand it to the API.
# NOTE(review): the indentation of this cell was lost in the source this was
# recovered from. The nesting below (participation lookup not nested under the
# "WiSe" check, one force_courses call per frame) is the most plausible
# reading - confirm against the notebook.
for (institute_semester, df) in result_frames.items():
    collection = []
    # Keys have the form "<institute>_<semester>"
    institute, semester = institute_semester.split("_")
    institute = CONSTANTS.INSTITUTE_MAP[institute]
    # Participation tables are keyed with "/" instead of "-"
    if "WiSe" in semester:
        semester = semester.replace("-", "/")
    if semester in participation:
        # This may lead to data loss but is impossible to work around
        semester_obj = participation[semester]
        # In-place: this also changes the frame stored in `participation`
        semester_obj.drop_duplicates(inplace=True)
        names = semester_obj["Person-Name"]
        semester_obj["Person-Name"] = names.map(norm_name)
        # Left merge keeps every course and adds 'Anzahl Teilnehmende' where a match exists
        df = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"], how="left")
    # iterrows yields (index, row) tuples, hence row[1] below
    for row in df.iterrows():
        course_object = {
            "course_number": str(row[1]["Veranstaltung-Nr."]),
            "course_name": row[1]["Veranstaltung-Name"],
            "course_type": row[1]["Veranstaltung-Typ"],
            "lecturer": row[1]["Person-Name"],
            "semester": semester,
            "institute": institute,
            "answers": row[1]["Antworten"],
            # -1 indicates that no data is available
            "participants": -1,
            "scores": []
        }
        # The column only exists if the merge above took place
        if "Anzahl Teilnehmende" in df.columns:
            if not math.isnan(row[1]["Anzahl Teilnehmende"]):
                course_object["participants"] = int(row[1]["Anzahl Teilnehmende"])
        for col in row[1].index:
            # Only question columns (UUID names) carry scores
            if uuid_regex.match(col) != None:
                # Early Break for irrelevant columns
                if math.isnan(row[1][col]):
                    continue
                course_object["scores"].append({"question": col, "score": int(row[1][col])})
        collection.append(course_object)
    api.force_courses(collection)
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment