Something went wrong on our end
-
borzechof99 authoredborzechof99 authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
course_scraper.py 1.48 KiB
"""
Implementation of a rudimentary scraping tool
for http://www.buchsys.de for SWP UniSport-O-Mat.
"""
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def fetch_website(url, timeout=10):
    """
    Helper function to fetch the course links of a website.

    Uses requests to fetch the html page and BeautifulSoup to parse the html.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up
            (handed to ``requests.get``).

    Returns:
        All ``<a>`` tags carrying an ``href`` attribute that are located
        inside the page's ``<dl class="bs_menu">`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status. The error
            is printed before it is re-raised.
        AttributeError: If the page has no ``<dl class="bs_menu">`` element.
    """
    try:
        # without a timeout, requests may wait forever on a stalled server
        response = requests.get(url, timeout=timeout)
        # turn 4xx/5xx answers into an HTTPError instead of parsing an error page
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(err)
        raise

    # parse the html content with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # pinpoint the parser only to the section containing the course names and links
    menu = soup.find("dl", {"class": "bs_menu"})
    if menu is None:
        # Same exception type the implicit ``None.find_all`` used to produce,
        # but with a message that tells what is actually missing.
        raise AttributeError(f'no <dl class="bs_menu"> element found at {url}')
    return menu.find_all("a", href=True)
def scraping(site=None) -> dict:
    """
    Returns a dictionary of the form {name: link} with the scraped courses.

    Args:
        site: URL of the overview page to scrape. Defaults to
            https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/.
            May point to a directory (ending in "/") or to a file such as
            ".../index.html"; links are resolved relative to it.

    Returns:
        A dict mapping the text of every link found by ``fetch_website``
        to its absolute URL, leaving out the link to the Restplätze-Suche.
    """
    if site is None:
        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"

    # urljoin resolves each href the way a browser would, so a site ending in
    # "index.html" or an absolute href no longer produces a broken link as
    # plain string concatenation did.
    return {
        element.text: urljoin(site, element["href"])
        for element in fetch_website(site)
        # filters out the link to the Restplätze-Suche, which isn't a course itself
        if element["href"] != "kurssuche.html#RP"
    }
if __name__ == "__main__":
    # When run as a script, scrape the default page and show the result.
    scraped_courses = scraping()
    print(scraped_courses)