From 45f32c76a4a81d4b108998d847d791b33d643017 Mon Sep 17 00:00:00 2001
From: dominip89 <dominip89@mi.fu-berlin.de>
Date: Fri, 14 May 2021 08:34:16 +0000
Subject: [PATCH] Resolve "Parsen von buchsys.de"

---
 requirements.txt                              |  9 +++-
 unisportomat/course_scraper/course_scraper.py | 44 +++++++++++++++++++
 .../course_scraper/test_course_scraper.py     | 10 +++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 unisportomat/course_scraper/course_scraper.py
 create mode 100644 unisportomat/course_scraper/test_course_scraper.py

diff --git a/requirements.txt b/requirements.txt
index 2cde7b3..4ecec85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,11 @@
 asgiref==3.3.4
+beautifulsoup4==4.9.3
+certifi==2020.12.5
+chardet==4.0.0
 Django==3.2
+idna==2.10
 pytz==2021.1
-sqlparse==0.4.1
\ No newline at end of file
+requests==2.25.1
+soupsieve==2.2.1
+sqlparse==0.4.1
+urllib3==1.26.4
diff --git a/unisportomat/course_scraper/course_scraper.py b/unisportomat/course_scraper/course_scraper.py
new file mode 100644
index 0000000..27ea1ce
--- /dev/null
+++ b/unisportomat/course_scraper/course_scraper.py
@@ -0,0 +1,44 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def fetch_website(url):
+    """
+    Helper function to fetch the content of a website.
+    Uses requests to fetch the html page and BeautifulSoup to parse the html.
+    """
+    try:
+        # get an object containing the web page's html
+        response = requests.get(url)
+
+        # parse the html content with BeautifulSoup
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # pinpoint the parser only to the section containing the course names and links
+        return soup.find("dl", {"class": "bs_menu"}).find_all("a", href=True)
+
+    except requests.exceptions.RequestException as e:
+        print(e)
+
+
+def scraping(site=None) -> dict:
+    """
+    Returns a dictionary of the form {name: link}, containing the scraped content of https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/index.html, unless another URL is given as an argument.
+    """
+    courses = {}
+
+    if site is None:
+        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"
+
+    website = fetch_website(site)
+
+    for element in website:
+        # filters out the link to the Restplätze-Suche, which isn't a course itself
+        if element["href"] != "kurssuche.html#RP":
+            courses[element.text] = f'{site}{element["href"]}'
+
+    return courses
+
+
+if __name__ == "__main__":
+    print(scraping())
diff --git a/unisportomat/course_scraper/test_course_scraper.py b/unisportomat/course_scraper/test_course_scraper.py
new file mode 100644
index 0000000..17df436
--- /dev/null
+++ b/unisportomat/course_scraper/test_course_scraper.py
@@ -0,0 +1,10 @@
+from django.test import TestCase
+from course_scraper import fetch_website, scraping
+
+
+class ScraperTestCase(TestCase):
+    def test_returns_dict(self):
+        self.assertIsInstance(scraping(), dict)
+
+    def test_dict_not_empty(self):
+        self.assertTrue(len(scraping()) > 0)
-- 
GitLab