def normalize_type(type: str) -> str:
	"""Return the canonical questionnaire-type identifier for *type*.

	The raw evaluation data uses several spellings for the same questionnaire
	type (e.g. "V", "Vl" or "VL+Ü" for lectures).  This maps them onto the
	identifiers the pipeline expects; unknown identifiers pass through
	unchanged.

	NOTE(review): the parameter shadows the builtin ``type``; the name is kept
	for backward compatibility with existing keyword calls.
	"""
	normalized_identifiers = {
		"V": "VL",
		"Vl": "VL",
		"VL+Ü": "VL",
		"VLeKo": "VL",
		"SLeKo": "S"
	}
	# Using get prevents KeyErrors for types that already are right
	return normalized_identifiers.get(type, type)

def make_plural(category: str) -> str:
	"""Return the plural form of a German course-category name.

	Categories already in plural form (or unknown ones) are returned as-is.
	"""
	plural_list = {
		"Vorlesung": "Vorlesungen",
		"Übung": "Übungen",
		"Seminar": "Seminare"
	}

	return plural_list.get(category, category)

def request_questions():
	"""Fetch the question catalogue from the local questions API as a list of dicts."""
	# assumes the API service is running locally on port 50000 — TODO confirm
	res = requests.get("http://localhost:50000/v1/questions")
	return json.loads(res.text)

def choose(a, b):
	"""Return *a* unless it is "empty", otherwise return *b*.

	"Empty" means: an empty string, a NaN number, or None for any other type.
	Used to merge duplicated question columns where text takes precedence
	over blank cells.
	"""
	if isinstance(a, str):
		return b if a == "" else a
	elif isinstance(a, numbers.Number):
		return b if math.isnan(a) else a
	else:
		return b if a is None else a
\n", + "- It is quite common to see `VL`, `Vl` or `V` for lectures. In small cases they are even denoted by `VL + Ü`. To unify these types, they will all be converted to `VL`. \n", + "- Teaching competence has multiple ways to be categorized. Older data may have `VLeKo` or `SLeKo` for lectures or seminars evaluated according to teaching competence. This will be normalized to `LeKo`. In this case the course type will be changed in the corresponding course type column, to preserve the original type of the course" + ] + }, + { + "cell_type": "code", + "execution_count": 429, + "metadata": {}, + "outputs": [], + "source": [ + "for (path, df) in paths:\n", + "\tdf[\"Veranstaltung-Typ\"] = df[\"Veranstaltung-Typ\"].map(normalize_type)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Normalize Course Category\n", + "The pipeline handles the three course types lectures (`Vorlesungen`), tutorials (`Übungen`) and seminars (`Seminare`). The provided date knows a few more types.\n", + "- Tutorials are mostly noted with the singular `Übung`. This gets translated into plural to keep it in line with the other types, which all are plural\n", + "- Data has a type called `Projekte / Praktika`, which don't have an own questionnaire type, so the name gets normalized to the questionnaire type it belongs to\n", + "### Problems:\n", + "- if a course has `LeKo` as a questionnaire and course type, it's impossible to handle. Possible Solution: VV Webscraper\n", + "- if a course has `Prak` as questionnaire type and `Projekte / Praktika` as course type, it can't be categorized. 
# Maps a canonical questionnaire type to the (plural) course category it implies.
normalized_categories = {
	"VL": "Vorlesungen",
	"Ü": "Übungen",
	"S": "Seminare"
}

for _, frame in paths:
	# Row-wise pass: categories may need to be derived from the questionnaire
	# type or merely pluralized.
	for idx in frame.index:
		row = frame.loc[idx]
		category = row.loc["Veranstaltung-Kategorie"]

		# Rows whose category is already in its final plural form need no work.
		if category in ["Vorlesungen", "Übungen", "Seminare"]:
			continue

		questionnaire = row.loc["Veranstaltung-Typ"]

		if questionnaire in normalized_categories:
			# The questionnaire type directly determines the category.
			frame.at[idx, "Veranstaltung-Kategorie"] = normalized_categories[questionnaire]
		else:
			# Otherwise just turn the singular category name into its plural.
			frame.at[idx, "Veranstaltung-Kategorie"] = make_plural(category)
# Matches the ".<n>" suffix pandas appends to de-duplicate repeated column
# names (raw string: "\." and "\d" are invalid escapes in a plain literal).
end_regex = re.compile(r"\.\d+$")

for (path, df) in paths:
	# Snapshot the column labels: columns are dropped inside the loop, and
	# mutating the frame while iterating it directly is unsafe.
	for column in list(df.columns):
		# If this line matches, a sufficient requirement for a duplicated question is met
		if (match := end_regex.search(column)) is not None:
			clean_name = column[:-len(match.group())]
			# Merges the duplicated column into the original (text wins over
			# empty cells via `choose`), then deletes the duplicate.
			df[clean_name] = df[clean_name].combine(df[column], choose)
			df.drop(columns=[column], inplace=True)
for (path, df) in paths:
	# The folder name encodes "<institute>_<semester>"; os.path.basename is
	# portable, unlike splitting on "/" (os.walk joins with os.sep).
	information = os.path.basename(path)
	(institute, semester) = information.split("_")
	institute_path = os.path.join(CONSTANTS.CLEAN_DATA_PATH, institute)
	semester_path = os.path.join(institute_path, semester)

	# Creates both directory levels; exist_ok avoids the racy isdir/mkdir pair.
	os.makedirs(semester_path, exist_ok=True)

	questionnaire_types = ["VL", "Ü", "S", "LeKo"]

	# Separate into types: one CSV per questionnaire type
	for q_type in questionnaire_types:
		res = df[df["Veranstaltung-Typ"] == q_type]
		result_path = os.path.join(semester_path, q_type + ".csv")

		res.to_csv(result_path, encoding="utf-8", index=False)

	# Rows whose type matches none of the four questionnaires
	rest = df[~df["Veranstaltung-Typ"].isin(questionnaire_types)]

	if not rest.empty:
		rest.to_csv(os.path.join(semester_path, "rest.csv"), encoding="utf-8", index=False)

	# The full, unsplit dataframe for consumers that want everything at once
	df.to_csv(os.path.join(semester_path, "all.csv"), encoding="utf-8", index=False)
d34e3038b800a1ff341bd02b1b6d01ad57ec1463..0d217a4839ca18a857ce71742aac404399acfe44 100644 --- a/src/db/questions.json +++ b/src/db/questions.json @@ -58,7 +58,7 @@ ["Der/die Lehrende regt die Studierenden an, sich mit den Lerninhalten auch außerhalb der Veranstaltung auseinanderzusetzen", "scale_applies"], ["Ich habe die im Rahmen der Lehrveranstaltung gestellten Aufgaben umfassend bearbeitet", "scale_applies"], ["Ich hatte jederzeit einen Überblick über anstehende Termine und zu erledigende Aufgaben", "scale_applies"], - ["Wie viele Stunden pro Woche brauchen Sie ungefähr für die Vor- und Nachbereitung der Lehrveranstaltung (inklusive Hausaufgaben)", "scale_time"], + ["Wieviele Stunden pro Woche brauchen Sie ungefähr für die Vor- und Nachbereitung der Lehrveranstaltung (inklusive Hausaufgaben)", "scale_time"], ["Werden im Rahmen der Lehrveranstaltung Lernmaterialien (z.B. Videoaufzeichnungen) online zur Verfügung gestellt", "y_n"], ["Wenn ja, sind die Lernmaterialien hilfreich", "scale_applies"], ["Der Umgang mit rein digitalen Kursinhalten im Vergleich zu regelmäßigen wöchentlichen Präsenzveranstaltungen ist", "scale_diff"], @@ -99,7 +99,7 @@ ["Das Thema der Lehrveranstaltung hat mich schon vorher interessiert", "scale_applies_alt"], ["Ich verfügte bereits vor dem Lehrveranstaltungsbesuch über umfangreiches Wissen zu den in der LV behandelten Themengebieten", "scale_applies_alt"], ["Ich verfügte bereits vor der Lehrveranstaltung über viel Erfahrung mit E-Learning", "scale_applies_alt"], - ["Ich habe an mindestens zwei Drittel der Termine dieser Lehrveranstaltung teilgenommen bzw. die Inhalte Lerneinheiten bearbeitet", "y_n"], + ["Ich habe an mindestens zwei Drittel der Termine dieser Lehrveranstaltung teilgenommen bzw. 
die Inhalte der Lerneinheiten bearbeitet", "y_n"], ["Der Stoffumfang, der in der Veranstaltung behandelt wird, ist für mich", "scale_height"], ["Der Schwierigkeitsgrad der Veranstaltung ist für mich", "scale_height"], ["Der/die Lehrende hat die gesamte Lehrveranstaltung gut strukturiert und nachvollziehbar gegliedert", "scale_applies_alt"], @@ -112,10 +112,10 @@ ["Der/die Lehrende fasst regelmäßig die wichtigsten Inhalte der LV zusammen", "scale_applies_alt"], ["Der/die Lehrende stellt zu Beginn eines Termins/einer Lerneinheit den Zusammenhang zu dem Termin/der Lerneinheit davor her", "scale_applies_alt"], ["Der/die Lehrende stellt immer wieder Bezüge zu bereits behandelten Lerninhalten her", "scale_applies_alt"], - ["Der/die Lehrende stellt Fragen, die den Studierenden die Gelegenheit geben, zu überprüfen, ob sie den Inhalt verstanden haben", "scale_applies_alt"], + ["Der/die Lehrende stellt Fragen, die den Studierenden die Gelegenheit geben zu überprüfen, ob sie den Inhalt verstanden haben", "scale_applies_alt"], ["Der/die Lehrende vergewissert sich, dass die Studierenden zentrale Aspekte verstanden haben, bevor er/sie im Stoff weitergeht", "scale_applies_alt"], ["Der/die Lehrende ist bei Fragen gut erreichbar", "scale_applies_alt"], - ["Der/die Lehrende gibt mir ein konstruktives Feedback zu meinen Beitragen/Lösungen", "scale_applies_alt"], + ["Der/die Lehrende gibt mir ein konstruktives Feedback zu meinen Beiträgen/Lösungen", "scale_applies_alt"], ["Der/die Lehrende gibt mir zu wenige Rückmeldungen zu meinen Beiträgen/Lösungen", "scale_applies_alt"], ["Der/die Lehrende gibt mir konkrete Hinweise zur Verbesserung meiner Leistungen", "scale_applies_alt"], ["Der/die Lehrende setzt Modelle, Graphiken oder Schemata so ein, dass sie das Verständnis komplexer Sachverhalte erleichtern", "scale_applies_alt"], @@ -137,7 +137,7 @@ ["Der/die Lehrende hat klare Verhaltensregeln für unsere Zusammenarbeit in dieser Lehrveranstaltung kommuniziert", "scale_applies_alt"], 
["Der/die Lehrende achtet darauf, dass wir die aufgestellten Verhaltensregeln einhalten", "scale_applies_alt"], ["Der/die Lehrende bringt wichtige Inhalte gut auf den Punkt", "scale_applies_alt"], - ["Der/die Lehrende nutzt die Verfügbare Zeit effektiv", "scale_applies_alt"], + ["Der/die Lehrende nutzt die verfügbare Zeit effektiv", "scale_applies_alt"], ["In dieser Lehrveranstaltung habe ich viel dazugelernt", "scale_applies_alt"], ["Insgesamt bin ich mit dieser Lehrveranstaltung zufrieden", "scale_applies_alt"] ], diff --git a/src/db/schema.sql b/src/db/schema.sql index 7f90b407d8d3b32782e98bd0c232695e77c2e528..d5dc83de1e553bca1403b53eee3dfdc9e41bc482 100644 --- a/src/db/schema.sql +++ b/src/db/schema.sql @@ -172,10 +172,10 @@ INSERT INTO "question" ("id", "content", "possibilities", "dimension") VALUES ('fb98204a-e7c6-11ee-9404-00620b2c9060', 'Der Umgang mit rein digitalen Kursinhalten im Vergleich zu regelmäßigen wöchentlichen Präsenzveranstaltungen ist', 'scale_difficulty', NULL), ('fb984109-e7c6-11ee-9404-00620b2c9060', 'Ich war an mindestens zwei Drittel der Termine dieser Lehrveranstaltung anwesend', 'y_n', NULL), ('fb985e1a-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende trägt anregend und engagiert vor', 'scale_applies', 'B2'), -('fb987b71-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende stellt Fragen, die den Studierenden die Gelegenheit geben, zu überprüfen, ob sie den Inhalt verstanden haben', 'scale_applies', 'A2'), +('fb987b71-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende stellt Fragen, die den Studierenden die Gelegenheit geben zu überprüfen, ob sie den Inhalt verstanden haben', 'scale_applies', 'A2'), ('fb989861-e7c6-11ee-9404-00620b2c9060', 'Ich verfügte bereits vor dem Veranstaltungsbesuch über umfangreiches Wissen zu den in der Lehrveranstaltung behandelten Themengebieten', 'scale_applies', NULL), ('fb98b420-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende stellt zu Beginn einer Sitzung den Zusammenhang zur letzen Sitzung her', 'scale_applies', 
'A1'), -('fb98d60e-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende gibt mir ein konstruktives Feedback zu meinen Beitragen/Lösungen', 'scale_applies', 'A2'), +('fb98d60e-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende gibt mir ein konstruktives Feedback zu meinen Beiträgen/Lösungen', 'scale_applies', 'A2'), ('fb98f563-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende stellt immer wieder Bezüge zu bereits behandelten Lerninhalten her', 'scale_applies', 'A1'), ('fb9915da-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende fesselt die Studierenden durch eine anregende und engagierte Vortragsweise', 'scale_applies', 'B2'), ('fb993295-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende setzt Modelle, Graphiken oder Schemata so ein, dass sie das Verständnis komplexer Sachverhalte erleichtern', 'scale_applies', 'A1'), @@ -184,7 +184,7 @@ INSERT INTO "question" ("id", "content", "possibilities", "dimension") VALUES ('fb998cc4-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende bringt wichtige Inhalte gut auf den Punkt', 'scale_applies', 'C2'), ('fb99ad31-e7c6-11ee-9404-00620b2c9060', 'Insgesamt bin ich mit dieser Lehrveranstaltung zufrieden', 'scale_applies', NULL), ('fb99cb87-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende unterstützt Studierende bei Lernschwierigkeiten', 'scale_applies', 'B1'), -('fb99e833-e7c6-11ee-9404-00620b2c9060', 'Wie viele Stunden pro Woche brauchen Sie ungefähr für die Vor- und Nachbereitung der Lehrveranstaltung (inklusive Hausaufgaben)', 'scale_time', NULL), +('fb99e833-e7c6-11ee-9404-00620b2c9060', 'Wieviele Stunden pro Woche brauchen Sie ungefähr für die Vor- und Nachbereitung der Lehrveranstaltung (inklusive Hausaufgaben)', 'scale_time', NULL), ('fb9a0a66-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende stellt immer wieder Bezüge zu den bereits behandelten Lerninhalten her', 'scale_applies', 'A1'), ('fb9a2c05-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende hat die gesamte Lehrveranstaltung gut strukturiert und nachvollziehbar gegliedert', 'scale_applies', 
'A1'), ('fb9a4ce2-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende nimmt die Beiträge der Studierenden ernst', 'scale_applies', 'B1'), @@ -202,8 +202,8 @@ INSERT INTO "question" ("id", "content", "possibilities", "dimension") VALUES ('fb9bd3e1-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende ist in der Lage, die Studierenden für die in der Lehrveranstaltung behandelten Inhalte zu interessieren', 'scale_applies', 'B2'), ('fb9bf803-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende gibt den Studierenden zu wenige Rückmeldungen zu ihren Beiträgen/Antworten', 'scale_applies', 'A2'), ('fb9c1da7-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende ist bei Fragen gut erreichbar', 'scale_applies', 'A2'), -('fb9c4149-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende nutzt die Verfügbare Zeit effektiv', 'scale_applies', 'C2'), -('fb9c6523-e7c6-11ee-9404-00620b2c9060', 'Ich habe an mindestens zwei Drittel der Termine dieser Lehrveranstaltung teilgenommen bzw. die Inhalte Lerneinheiten bearbeitet', 'y_n', NULL), +('fb9c4149-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende nutzt die verfügbare Zeit effektiv', 'scale_applies', 'C2'), +('fb9c6523-e7c6-11ee-9404-00620b2c9060', 'Ich habe an mindestens zwei Drittel der Termine dieser Lehrveranstaltung teilgenommen bzw. 
die Inhalte der Lerneinheiten bearbeitet', 'y_n', NULL), ('fb9c8f5a-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende verdeutlicht den Anwendungsbezug der Lerninhalte', 'scale_applies', 'B2'), ('fb9cb6cb-e7c6-11ee-9404-00620b2c9060', 'Der/die Lehrende regt die Studierenden dazu an, die Richtigkeit ihrer Beitrage/Antworten selbst zu überprüfen', 'scale_applies', 'B2'), ('fb9cdee4-e7c6-11ee-9404-00620b2c9060', 'Der Schwierigkeitsgrad der Veranstaltung ist für mich', 'scale_height', 'A1'), diff --git a/src/lib/contants.py b/src/lib/constants.py similarity index 85% rename from src/lib/contants.py rename to src/lib/constants.py index b8c46285b81ac7e7dd9e928502371604c22809bc..c15f429f1eca7bbab9f203ded705581613d14c88 100644 --- a/src/lib/contants.py +++ b/src/lib/constants.py @@ -8,4 +8,8 @@ MATHEMATICS_VALUES = os.path.join(RAW_DATA_PATH, "Mathematik") CLEAN_DATA_PATH = os.path.normpath(os.path.join(os.getcwd(), "..", "clean_data")) OUT_PATH = os.path.normpath(os.path.join(os.getcwd(), '..', "outputs")) -INSTITUTES = ["cs", "math", "bio"] \ No newline at end of file +INSTITUTE_MAP = { + "Bioinformatik": "bio", + "Mathematik": "math", + "Informatik": "cs" +} \ No newline at end of file diff --git a/src/lib/scraping.py b/src/lib/scraping.py index f3160eaa3365f32adc96352763adcffa1714f9eb..6454eb028f12d87bd3dc221a51ead85ea46558e3 100644 --- a/src/lib/scraping.py +++ b/src/lib/scraping.py @@ -1,5 +1,6 @@ import requests, re from bs4 import BeautifulSoup +from typing import Union # In the VV the Ids for the Institutes are fixed every semester INSTITUTE_IDS = { @@ -134,6 +135,7 @@ def append_dict(dictionary, key, value) -> dict: res = dictionary res[key] = value return res + def get_catalogue_for(semester: str) -> dict: cs = get_complete_catalogue_for(semester, "cs") cs = {k: append_dict(v, "institute", "cs") for k, v in cs.items()} @@ -146,4 +148,22 @@ def get_catalogue_for(semester: str) -> dict: merge = math | bio | cs - return merge \ No newline at end of file + return merge 
def get_course(id: str) -> Union[dict, None]:
	"""Look up a single course in the FU Berlin Vorlesungsverzeichnis by id.

	Queries the VV search page for *id* and scrapes the first result panel.
	Returns a dict with "id", "name" and "type" keys, or None when the id is
	unknown (the VV then serves its "Erweiterte Suche" page instead of a
	course page).

	NOTE(review): the parameter shadows the builtin ``id``; kept for backward
	compatibility with existing callers.
	"""
	search_url = "https://www.fu-berlin.de/vv/de/search?utf8=✓&query={0}".format(id)
	res = requests.get(search_url)

	soup = BeautifulSoup(res.text, 'html.parser')

	main_content = soup.select('#main_content .well')[0]
	name = main_content.select("h1")[0].text
	# The generic search page is returned when no course matches the id
	if name == "Erweiterte Suche": return None

	ret = {
		"id": id,
		# Reuse the heading already scraped above instead of querying it again
		"name": name,
		"type": main_content.select(".label_container span")[0].text.strip()
	}
	return ret
import os, sys

from schema import Schema

# Make the repository root (four directory levels above this file) importable
# so the src package can be resolved from within the tests folder.
path = os.path.abspath(__file__)
for _ in range(4):
    path = os.path.dirname(path)

sys.path.append(path)
import src.lib.scraping as scraping

def test_valid_id():
    """A known course id yields a dict with id/name/type string fields."""
    result_schema = Schema({
        'id': str,
        'name': str,
        'type': str
    })

    res = scraping.get_course("19201401")
    assert result_schema.is_valid(res)

def test_invalid_id():
    """An unknown id makes get_course return None instead of raising."""
    res = scraping.get_course("0000000")
    assert res is None