"""
Implementation of a rudimentary scraping tool
for http://www.buchsys.de, built for the SWP UniSport-O-Mat.
"""

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def fetch_website(url):
    """
    Helper function to fetch and parse a web page.
    Uses requests to fetch the HTML and BeautifulSoup to parse it,
    returning the <a> tags found in the "bs_menu" course list.
    """
    try:
        # fetch the web page; time out rather than hang indefinitely
        response = requests.get(url, timeout=10)
        # raise a RequestException subclass on HTTP error codes (4xx/5xx)
        response.raise_for_status()

        # parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # pinpoint the parser only to the section containing the course names and links
        return soup.find("dl", {"class": "bs_menu"}).find_all("a", href=True)

    except requests.exceptions.RequestException as err:
        print(err)
        raise


def scraping(site=None) -> dict:
    """
    Returns a dictionary of the form {name: link},
    containing the scraped content of
    https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/index.html,
    unless another URL is given as an argument.
    """
    courses = {}

    if site is None:
        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"

    website = fetch_website(site)

    for element in website:
        # filters out the link to the Restplätze-Suche, which isn't a course itself
        if element["href"] != "kurssuche.html#RP":
            # urljoin resolves the relative href against the base URL, so both
            # ".../aktueller_zeitraum/" and ".../index.html" work as `site`
            courses[element.text] = urljoin(site, element["href"])

    return courses
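

def _parse_demo() -> dict:
    """
    Illustrative sketch only (not used by the scraper): demonstrates the same
    BeautifulSoup parsing step as fetch_website() on a hand-written HTML
    snippet. The sample markup is an assumption about the buchsys menu
    structure, not a copy of the real page.
    """
    sample_html = (
        '<dl class="bs_menu">'
        '<dt><a href="_Yoga.html">Yoga</a></dt>'
        '<dt><a href="kurssuche.html#RP">Restplätze-Suche</a></dt>'
        "</dl>"
    )
    soup = BeautifulSoup(sample_html, "html.parser")
    links = soup.find("dl", {"class": "bs_menu"}).find_all("a", href=True)
    # mirror the filter in scraping(): drop the Restplätze-Suche link
    return {a.text: a["href"] for a in links if a["href"] != "kurssuche.html#RP"}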


if __name__ == "__main__":
    print(scraping())