Skip to content
Snippets Groups Projects
Commit 45f32c76 authored by dominip89's avatar dominip89 Committed by fu2662cw
Browse files

Resolve "Parsen von buchsys.de"

parent 757d7273
No related branches found
No related tags found
No related merge requests found
asgiref==3.3.4
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
Django==3.2
idna==2.10
pytz==2021.1
sqlparse==0.4.1
\ No newline at end of file
requests==2.25.1
soupsieve==2.2.1
sqlparse==0.4.1
urllib3==1.26.4
import requests
from bs4 import BeautifulSoup
def fetch_website(url):
    """
    Fetch the given URL and return the course links from its booking menu.

    Downloads the page with requests, parses it with BeautifulSoup, and
    narrows the result to the anchors inside the <dl class="bs_menu">
    element, which holds the course names and links on buchsys.de pages.

    :param url: address of the buchsys.de course-overview page
    :return: list of <a> tags with an href attribute; an empty list when
             the request fails or the expected menu element is missing,
             never None, so callers can iterate the result safely
    """
    try:
        # timeout prevents hanging forever on an unresponsive server
        response = requests.get(url, timeout=10)
        # surface HTTP error statuses (4xx/5xx) as RequestException
        response.raise_for_status()
        # parse the html content with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        # pinpoint the parser only to the section containing the course names and links
        menu = soup.find("dl", {"class": "bs_menu"})
        if menu is None:
            # page layout changed or wrong URL: no course menu present
            return []
        return menu.find_all("a", href=True)
    except requests.exceptions.RequestException as e:
        print(e)
        # keep the contract "always returns an iterable" on failure
        return []
def scraping(site=None) -> dict:
    """
    Return a dictionary of the form {name: link} with the scraped courses.

    Scrapes https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/
    unless another URL is given as *site*. Links are built by appending
    each anchor's relative href to *site*, so *site* should end with "/".

    :param site: optional course-overview URL to scrape instead of the default
    :return: mapping of course name to absolute booking link; empty when
             the page could not be fetched
    """
    courses = {}
    if site is None:
        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"
    # guard with `or []` in case fetch_website ever yields a falsy result
    website = fetch_website(site) or []
    for element in website:
        # filters out the link to the Restplätze-Suche, which isn't a course itself
        if element["href"] != "kurssuche.html#RP":
            courses[element.text] = f'{site}{element["href"]}'
    return courses
if __name__ == "__main__":
    # Ad-hoc smoke run: scrape the default page and dump the result.
    result = scraping()
    print(result)
from django.test import TestCase
from course_scraper import fetch_website, scraping
class ScraperTestCase(TestCase):
    """Integration tests for the buchsys.de scraper (hits the live site)."""

    def test_returns_dict(self):
        # scraping() must always hand back a dictionary
        result = scraping()
        self.assertIsInstance(result, dict)

    def test_dict_not_empty(self):
        # the live course page should yield at least one course entry
        result = scraping()
        self.assertTrue(len(result) > 0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment