diff --git a/requirements.txt b/requirements.txt
index 2cde7b39e66d8003f4e99d1c121a15fee032769a..4ecec85564b1a27bce2d30e8f01924e2de9df4e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,11 @@
 asgiref==3.3.4
+beautifulsoup4==4.9.3
+certifi==2020.12.5
+chardet==4.0.0
 Django==3.2
+idna==2.10
 pytz==2021.1
-sqlparse==0.4.1
\ No newline at end of file
+requests==2.25.1
+soupsieve==2.2.1
+sqlparse==0.4.1
+urllib3==1.26.4
diff --git a/unisportomat/course_scraper/course_scraper.py b/unisportomat/course_scraper/course_scraper.py
new file mode 100644
--- /dev/null
+++ b/unisportomat/course_scraper/course_scraper.py
@@ -0,0 +1,51 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def fetch_website(url):
+    """
+    Fetch a web page and return the anchor tags of its course menu.
+
+    Uses requests to fetch the HTML page and BeautifulSoup to parse it.
+    Returns an empty list if the request fails, so callers can iterate
+    over the result safely.
+    """
+    try:
+        # get an object containing the web page's html;
+        # timeout prevents hanging forever on an unresponsive server
+        response = requests.get(url, timeout=10)
+
+        # parse the html content with BeautifulSoup
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # pinpoint the parser only to the section containing the course names and links
+        return soup.find("dl", {"class": "bs_menu"}).find_all("a", href=True)
+
+    except requests.exceptions.RequestException as e:
+        print(e)
+        # return an empty list instead of None so scraping() can still iterate
+        return []
+
+
+def scraping(site=None) -> dict:
+    """
+    Return a dictionary of the form {name: link} with the scraped courses.
+
+    Scrapes https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/
+    unless another URL is given as an argument.
+    """
+    courses = {}
+
+    if site is None:
+        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"
+
+    website = fetch_website(site)
+
+    for element in website:
+        # filters out the link to the Restplätze-Suche, which isn't a course itself
+        if element["href"] != "kurssuche.html#RP":
+            courses[element.text] = f'{site}{element["href"]}'
+
+    return courses
+
+
+if __name__ == "__main__":
+    print(scraping())
diff --git a/unisportomat/course_scraper/test_course_scraper.py b/unisportomat/course_scraper/test_course_scraper.py
new file mode 100644
--- /dev/null
+++ b/unisportomat/course_scraper/test_course_scraper.py
@@ -0,0 +1,10 @@
+from django.test import TestCase
+from course_scraper import fetch_website, scraping
+
+
+class ScraperTestCase(TestCase):
+    def test_returns_dict(self):
+        self.assertIsInstance(scraping(), dict)
+
+    def test_dict_not_empty(self):
+        self.assertTrue(len(scraping()) > 0)