Initial commit

09518761 · djcaballero · 09518761
Commit 09518761 authored 8 years ago by djcaballero
--- a/web-scraper.py
+++ b/web-scraper.py
+#import libraries
+import unicodecsv as csv
+import requests
+from bs4 import BeautifulSoup
+
+def getSoup(url):
+    #query the website
+    page_query = requests.get(url)
+    #parse the html using BS
+    soup_page = BeautifulSoup(page_query.text, 'html.parser')
+    return soup_page
+
+def main():
+
+    #open csv.file and create  csv.writer
+    csv_file = open('heise_articles.csv', 'wb')
+    writer = csv.writer(csv_file, delimiter=',')
+
+    #store the urls to be used in urls
+    urls=[]
+    url1 = 'https://www.heise.de/thema/https'
+    url2 = 'https://www.heise.de/thema/https?seite=1'
+    url3 = 'https://www.heise.de/thema/https?seite=2'
+    url4 = 'https://www.heise.de/thema/https?seite=3'
+    urls.append(url1)
+    urls.append(url2)
+    urls.append(url3)
+    urls.append(url4)
+
+    #get the parent <div class="keywordliste">...<\div>
+    for i in urls:
+        parent_div = getSoup(i).find('div', attrs={'class': 'keywordliste'})
+        #get the <nav>...<\nav>
+        nav_tag = parent_div.nav
+        #get all headers
+        articles = nav_tag.findAll('header')
+        #print all headers
+        for article in articles:
+            article = article.text.encode('utf-8')
+            writer.writerow([article])
+
+if __name__ == '__main__':
+    main()
+