Skip to content
Snippets Groups Projects
Commit aba99400 authored by fu2662cw's avatar fu2662cw :speech_balloon:
Browse files

Merge branch '7-parsen-von-buchsys-de' into 'master'

Resolve "Parsen von buchsys.de"

Closes #7

See merge request swp-unisport/team-warumkeinrust/unisport-o-mat!6
parents 757d7273 45f32c76
No related branches found
No related tags found
No related merge requests found
asgiref==3.3.4
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
Django==3.2
idna==2.10
pytz==2021.1
requests==2.25.1
soupsieve==2.2.1
sqlparse==0.4.1
urllib3==1.26.4
import requests
from bs4 import BeautifulSoup
def fetch_website(url):
    """
    Fetch a page and return the links found in its ``bs_menu`` list.

    Downloads ``url`` with requests, parses the HTML with BeautifulSoup and
    collects every ``<a>`` tag carrying an ``href`` attribute inside the
    first ``<dl class="bs_menu">`` element.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of anchor tags. The list is empty when the request fails,
        the server answers with an HTTP error status, or the page does not
        contain a ``bs_menu`` list.
    """
    try:
        # A timeout keeps the call from blocking forever on an unresponsive server.
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return []

    # Parse the HTML content with BeautifulSoup.
    soup = BeautifulSoup(response.content, "html.parser")
    # Narrow the search to the section containing the course names and links.
    menu = soup.find("dl", {"class": "bs_menu"})
    if menu is None:
        # find() returns None when no such element exists; report "no links"
        # rather than failing with an AttributeError on None.
        return []
    return menu.find_all("a", href=True)
def scraping(site=None) -> dict:
    """
    Scrape the course overview page and return a ``{name: link}`` dictionary.

    Args:
        site: URL of the overview page. When omitted,
            https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/
            is used. A URL ending in a file name such as ``index.html`` is
            accepted as well; links are resolved relative to it.

    Returns:
        A dict mapping the text of each link to its absolute URL. The dict
        is empty when no links could be fetched.
    """
    # Imported here so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    if site is None:
        site = "https://www.buchsys.de/fu-berlin/angebote/aktueller_zeitraum/"

    # Guard against a falsy result (e.g. None after a failed request) so the
    # comprehension below never iterates over a non-iterable.
    links = fetch_website(site) or []

    # Skip the link to the Restplätze-Suche, which isn't a course itself.
    # urljoin resolves each relative href against the page URL, so a ``site``
    # that ends in ``index.html`` still yields valid links.
    return {
        link.text: urljoin(site, link["href"])
        for link in links
        if link["href"] != "kurssuche.html#RP"
    }
# Script entry point: scrape with the default URL and print the resulting dict.
if __name__ == "__main__":
    course_links = scraping()
    print(course_links)
from django.test import TestCase
from course_scraper import fetch_website, scraping
class ScraperTestCase(TestCase):
    """
    Checks on the result of scraping() called with its default URL.

    NOTE(review): scraping() is called without arguments, so these tests go
    through fetch_website and perform a real HTTP request.
    """

    def test_returns_dict(self):
        """scraping() returns a dict."""
        self.assertIsInstance(scraping(), dict)

    def test_dict_not_empty(self):
        """scraping() returns at least one entry."""
        # assertGreater reports the actual length on failure, which a bare
        # assertTrue(len(...) > 0) does not.
        self.assertGreater(len(scraping()), 0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment