diff --git a/src/pipeline.ipynb b/src/pipeline.ipynb index 4946ca4751182510a5d935dd65ad885119872f49..5050b045b58da1ced5b9a45918724ecbf8a20b82 100644 --- a/src/pipeline.ipynb +++ b/src/pipeline.ipynb @@ -105,11 +105,11 @@ "\n", "participation = {}\n", "\n", - "participation_path = os.path.join(CONSTANTS.RAW_DATA_PATH, \"participation\")\n", + "participation_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, \"participation\")\n", "for file in os.listdir(participation_path):\n", "\tif not file.endswith(\".csv\") or file.startswith(\"example\"): continue\n", "\tsemester = file.replace(\".csv\", \"\")\n", - "\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file), sep=\";\")" + "\tparticipation[semester.replace(\"-\", \"/\")] = pandas.read_csv(os.path.join(participation_path, file))" ] }, { @@ -143,15 +143,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "#print(result_frames)\n", - "def norm_name(name: str) -> str:\n", - "\tif ', ' not in name: return name\n", - "\tsplit = name.split(\", \")\n", - "\treturn split[1] + \" \" + split[0]\n", "for (institute_semester, df) in result_frames.items():\n", "\tcollection = []\n", "\tinstitute, semester = institute_semester.split(\"_\") \n", @@ -163,8 +158,7 @@ "\t\t# This may lead to data loss but is impossible to work around\n", "\t\tsemester_obj = participation[semester]\n", "\t\tsemester_obj.drop_duplicates(inplace=True)\n", - "\t\tnames = semester_obj[\"Person-Name\"]\n", - "\t\tsemester_obj[\"Person-Name\"] = names.map(norm_name)\n", + "\n", "\t\tdf = df.merge(semester_obj[['Veranstaltung-Nr.', 'Anzahl Teilnehmende', 'Person-Name', 'Veranstaltung-Name']], on=[\"Veranstaltung-Nr.\", \"Person-Name\", \"Veranstaltung-Name\"], how=\"left\")\n", "\n", "\tfor row in df.iterrows():\n",