Remove redundant clean in main pipeline

e98b464a · alexander06 · 8ac2d2f9 · e98b464a
Commit e98b464a authored May 9, 2024 by alexander06
--- a/src/pipeline.ipynb
+++ b/src/pipeline.ipynb
@@ -105,11 +105,11 @@
    "\n",
    "participation = {}\n",
    "\n",
-    "participation_path = os.path.join(CONSTANTS.RAW_DATA_PATH, \"participation\")\n",
+    "participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, \"participation\")\n",
    "for file in os.listdir(participation_path):\n",
    "\tif not file.endswith(\".csv\") or file.startswith(\"example\"): continue\n",
    "\tsemester = file.replace(\".csv\", \"\")\n",
-    "\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file), sep=\";\")"
+    "\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file))"
   ]
  },
  {
@@ -143,15 +143,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
-    "#print(result_frames)\n",
-    "def norm_name(name: str) -> str:\n",
-    "\tif ', ' not in name: return name\n",
-    "\tsplit = name.split(\", \")\n",
-    "\treturn split[1] + \" \" + split[0]\n",
    "for (institute_semester, df) in result_frames.items():\n",
    "\tcollection = []\n",
    "\tinstitute, semester = institute_semester.split(\"_\") \n",
@@ -163,8 +158,7 @@
    "\t\t# This may lead to data loss but is impossible to work around\n",
    "\t\tsemester_obj = participation[semester]\n",
    "\t\tsemester_obj.drop_duplicates(inplace=True)\n",
-    "\t\tnames = semester_obj[\"Person-Name\"]\n",
+    "\n",
-    "\t\tsemester_obj[\"Person-Name\"] = names.map(norm_name)\n",
    "\t\tdf = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=[\"Veranstaltung-Nr.\", \"Person-Name\", \"Veranstaltung-Name\"], how=\"left\")\n",
    "\n",
    "\tfor row in df.iterrows():\n",

 %% Cell type:markdown id: tags:
 # Evaluation Report Pipeline
 This file hosts the pipeline, which generates all images containing the final evaluation results.
 %% Cell type:code id: tags:
 ``` python
 import os, pandas, math,json
 import regex as re
 import lib.constants as CONSTANTS
 import lib.api as api
 question_cache = {}
 intervals = {}
 uuid_regex = re.compile('\\b[0-9a-f]{8}\\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\\b[0-9a-f]{12}\\b')
 ```
 %% Cell type:code id: tags:
 ``` python
 def file_iterator():
 	"""Yield ``[parent_dir, dir, dataframe]`` for every ``all.csv`` below ``CONSTANTS.CLEAN_DATA_PATH``.
 	The two directory names are the last two components of the folder that
 	contains the file; the caller unpacks them as ``(institute, semester)``.
 	The dataframe is the parsed content of that ``all.csv``.
 	"""
 	for (root, _dirs, files) in os.walk(CONSTANTS.CLEAN_DATA_PATH):
 		if 'all.csv' not in files:
 			continue
 		df = pandas.read_csv(os.path.join(root, "all.csv"))
 		# Normalise the path and split on the platform separator, so the
 		# last two components are found even where the separator is not "/".
 		yield os.path.normpath(root).split(os.sep)[-2:] + [df]
 def generate_scales():
 	"""Return a dict mapping each scale id from ``api.get_scales()`` to the result of ``api.get_answers_for`` for that id."""
 	return {entry["id"]: api.get_answers_for(entry["id"]) for entry in api.get_scales()}
 def generate_intervals():
 	"""Rebuild the module-level ``intervals`` dict: scale id -> list of the ``'value'`` field of each answer."""
 	global intervals
 	extracted = {}
 	for scale_id, answers in generate_scales().items():
 		extracted[scale_id] = [answer['value'] for answer in answers]
 	# Assign only after every scale was processed, so a failure leaves the old value intact
 	intervals = extracted
 def get_factors():
 	"""Return a dict with one rounded factor per entry of the global ``intervals``.
 	Each remaining entry must hold exactly two values ``(a, b)``; its factor
 	is ``round(abs(a - b - 1) / 2, 0)``.
 	"""
 	# These intervals either just have two steps or the answers have no weight
 	skipped = ("scale_time", "y_n")
 	factor = {}
 	for name, bounds in intervals.items():
 		if name in skipped:
 			continue
 		first, second = bounds
 		factor[name] = round(abs(first - second - 1) / 2, 0)
 	return factor
 # Matches a single digit (0-9), optionally followed by ".0".
 # Written as a raw string and compiled once instead of on every call.
 _NUMBER_REGEX = re.compile(r'^\d(\.0)?$')
 # The type of the value is unknown
 def convert_score(value, factor) -> int:
 	"""Translate a one-based answer value into its entry of ``factor``.
 	``value`` is converted with ``str()`` first, so real floats such as
 	``2.0`` are accepted as well as ``2`` or ``"2"``. Anything that is not
 	a single digit with an optional ``.0`` suffix yields ``0``.
 	Returns ``factor[value - 1]`` for accepted values, otherwise ``0``.
 	"""
 	# NOTE(review): a value of 0 gives index -1 and therefore the LAST entry
 	# of factor, and a digit larger than len(factor) raises IndexError.
 	# Confirm that neither can occur in the data.
 	if _NUMBER_REGEX.match(str(value)) is not None:
 		return factor[int(float(value) - 1)]
 	return 0
 def calc_score(series: pandas.Series) -> float:
 	"""Sum up the weighted answers of one column.
 	Columns whose name does not match ``uuid_regex`` yield ``None``.
 	For matching columns the weights come from ``intervals`` via the
 	scale returned by ``api.get_scale_for`` and are kept in ``question_cache``.
 	"""
 	column = series.name
 	if uuid_regex.match(column) is None:
 		# Not a question column: no score
 		return None
 	# TODO: Negate that one question
 	if column not in question_cache:
 		scale = api.get_scale_for(column)
 		question_cache[column] = intervals[scale["id"]]
 	factor = question_cache[column]
 	# Catches weightless questions
 	if not any(weight != 0 for weight in factor):
 		return 0
 	if factor == [1, 0]:
 		yes_flags = series.map(lambda answer: 1 if answer == "Ja" else 0)
 		return yes_flags.sum()
 	# Pandas saves values as float, so they need to be converted
 	converted = series.map(lambda answer: convert_score(answer, factor), na_action="ignore")
 	return converted.sum()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Generate static values
 generate_intervals()
 participation = {}
-participation_path = os.path.join(CONSTANTS.RAW_DATA_PATH, "participation")
+participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, "participation")
 for file in os.listdir(participation_path):
 	if not file.endswith(".csv") or file.startswith("example"): continue
 	semester = file.replace(".csv", "")
-	participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file), sep=";")
+	participation[semester.replace("-", "/")] = pandas.read_csv(os.path.join(participation_path, file))
 ```
 %% Cell type:markdown id: tags:
 ## Calculate weight for Dimension
 Calculates the individual score for all dimensions for every course
 %% Cell type:code id: tags:
 ``` python
 # Maps "<institute>_<semester>" to a frame with one row per course
 result_frames = {}
 for (institute, semester, df) in file_iterator():
 	# Single-row frames, one per course that passes the response threshold
 	course_rows = []
 	grouped = df.groupby(["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Kategorie", "Veranstaltung-Name"])
 	for (course_key, responses) in grouped:
 		# Keeps the at least five responses constraint
 		if len(responses) < 5:
 			continue
 		number, lecturer, category, name = course_key
 		head = pandas.Series({"Veranstaltung-Nr.": number, "Person-Name": lecturer, "Veranstaltung-Typ": category, "Veranstaltung-Name": name, "Antworten": len(responses)})
 		# Drop columns without any answer, score the rest, drop columns without a score
 		result = responses.dropna(axis="columns", how="all").apply(calc_score).dropna(how="all")
 		course_rows.append(pandas.concat([head, result]).to_frame().T)
 	# Concatenate once; growing a frame inside the loop copies all previous rows every time
 	res_df = pandas.concat(course_rows) if course_rows else pandas.DataFrame(data={})
 	result_frames["{0}_{1}".format(institute,semester)] = res_df
 ```
 %% Cell type:code id: tags:
 ``` python
-#print(result_frames)
-def norm_name(name: str) -> str:
-	if ', ' not in name: return name
-	split = name.split(", ")
-	return split[1] + " " + split[0]
 for (institute_semester, df) in result_frames.items():
 	collection = []
 	institute, semester = institute_semester.split("_")
 	institute = CONSTANTS.INSTITUTE_MAP[institute]
 	if "WiSe" in semester:
 		semester = semester.replace("-", "/")
 	if semester in participation:
 		# This may lead to data loss but is impossible to work around
 		semester_obj = participation[semester]
 		semester_obj.drop_duplicates(inplace=True)
-		names = semester_obj["Person-Name"]
-		semester_obj["Person-Name"] = names.map(norm_name)
 		df = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=["Veranstaltung-Nr.", "Person-Name", "Veranstaltung-Name"], how="left")
 	for row in df.iterrows():
 		course_object = {
 			"course_number": str(row[1]["Veranstaltung-Nr."]),
 			"course_name": row[1]["Veranstaltung-Name"],
 			"course_type": row[1]["Veranstaltung-Typ"],
 			"lecturer": row[1]["Person-Name"],
 			"semester": semester,
 			"institute": institute,
 			"answers": row[1]["Antworten"],
 			# -1 indicates that no data is available
 			"participants": -1,
 			"scores": []
 		}
 		if "Anzahl Teilnehmende" in df.columns:
 			if not math.isnan(row[1]["Anzahl Teilnehmende"]):
 				course_object["participants"] = int(row[1]["Anzahl Teilnehmende"])
 		for col in row[1].index:
 			if uuid_regex.match(col) != None:
 				# Early Break for irrelevant columns
 				if math.isnan(row[1][col]):
 					continue
 				course_object["scores"].append({"question": col, "score": int(row[1][col])})
 		collection.append(course_object)
 	api.force_courses(collection)
 ```