From 45f32c76a4a81d4b108998d847d791b33d643017 Mon Sep 17 00:00:00 2001
From: dominip89 <dominip89@mi.fu-berlin.de>
Date: Fri, 14 May 2021 08:34:16 +0000
Subject: [PATCH] Resolve "Parsen von buchsys.de"

---
 requirements.txt                              |  9 +++-
 unisportomat/course_scraper/course_scraper.py | 53 ++++++++++++++++++++
 .../course_scraper/test_course_scraper.py     | 10 ++++
 3 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 unisportomat/course_scraper/course_scraper.py
 create mode 100644 unisportomat/course_scraper/test_course_scraper.py

diff --git a/requirements.txt b/requirements.txt
index 2cde7b3..4ecec85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,11 @@
 asgiref==3.3.4
+beautifulsoup4==4.9.3
+certifi==2020.12.5
+chardet==4.0.0
 Django==3.2
+idna==2.10
 pytz==2021.1
-sqlparse==0.4.1
\ No newline at end of file
+requests==2.25.1
+soupsieve==2.2.1
+sqlparse==0.4.1
+urllib3==1.26.4
diff --git a/unisportomat/course_scraper/course_scraper.py b/unisportomat/course_scraper/course_scraper.py
new file mode 100644
index 0000000..27ea1ce
--- /dev/null
+++ b/unisportomat/course_scraper/course_scraper.py
@@ -0,0 +1,53 @@
+"""Scraper for the buchsys.de sports-course listing of FU Berlin."""
+import requests
+from bs4 import BeautifulSoup
+
+
+def fetch_website(url):
+    """
+    Fetch *url* and return the course links from its "bs_menu" list.
+
+    Returns a list of bs4 tags with an href attribute, or an empty list
+    if the request fails or the menu is missing, so callers can always
+    iterate the result.
+    """
+    try:
+        # A timeout keeps a dead server from hanging the scraper forever.
+        response = requests.get(url, timeout=10)
+
+        # Parse the HTML content with BeautifulSoup.
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Restrict the parser to the section with the course names and links.
+        menu = soup.find("dl", {"class": "bs_menu"})
+        if menu is None:
+            return []
+        return menu.find_all("a", href=True)
+
+    except requests.exceptions.RequestException as error:
+        # Best effort: report the problem and return no courses.
+        print(error)
+        return []
+
+
+def scraping(site=None) -> dict:
+    """
+    Return a dictionary of the form {name: link}, containing the scraped
+    content of the current buchsys.de listing for FU Berlin, unless
+    another URL is given as an argument.
+    """
+    if site is None:
+        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"
+
+    courses = {}
+
+    for element in fetch_website(site):
+        # Filter out the link to the Restplätze-Suche, which isn't a course.
+        if element["href"] != "kurssuche.html#RP":
+            courses[element.text] = f'{site}{element["href"]}'
+
+    return courses
+
+
+if __name__ == "__main__":
+    print(scraping())
diff --git a/unisportomat/course_scraper/test_course_scraper.py b/unisportomat/course_scraper/test_course_scraper.py
new file mode 100644
index 0000000..17df436
--- /dev/null
+++ b/unisportomat/course_scraper/test_course_scraper.py
@@ -0,0 +1,10 @@
+from django.test import TestCase
+from course_scraper import fetch_website, scraping
+
+
+class ScraperTestCase(TestCase):
+    def test_returns_dict(self):
+        self.assertIsInstance(scraping(), dict)
+
+    def test_dict_not_empty(self):
+        self.assertTrue(len(scraping()) > 0)
-- 
GitLab