first version with import

17b65b30 · sommerann · eb22c268 · 17b65b30
Commit 17b65b30 authored Jun 8, 2017 by sommerann
--- a/heise.py
+++ b/heise.py
+# python 3.6.1
+
+# imports
+from bs4 import BeautifulSoup
+import requests
+import csv
+
+def has_source(tag):
+    """Filter predicate for the search results.
+
+    Returns a truthy value only for tags that carry both a
+    'data-sourcechannel' attribute and a 'class' attribute.
+    """
+    has_channel = tag.has_attr('data-sourcechannel')
+    # Short-circuit: 'class' is only checked when the channel attribute exists.
+    return has_channel and tag.has_attr('class')
+
+# scraper website: https://www.heise.de/thema/https
+def main():
+    """Scrape the heise.de "https" topic page and write the headlines to heise.csv.
+
+    Fetches https://www.heise.de/thema/https, selects every tag accepted by
+    ``has_source`` and writes one semicolon-delimited row per tag: the label
+    "Überschrift: " followed by the children of the tag's first <header>
+    element. Tags without a <header> element are skipped.
+
+    Raises:
+        requests.RequestException: if the page cannot be fetched in time or
+            the server answers with an HTTP error status.
+    """
+    # Fetch and parse before touching the output file, so a failed request
+    # does not leave behind a truncated heise.csv.
+    response = requests.get("https://www.heise.de/thema/https", timeout=30)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, "lxml")
+    headlines = soup.find_all(has_source)  # find the corresponding entries
+
+    # newline='' is what the csv module expects from its file object; the
+    # explicit encoding keeps the non-ASCII label independent of the locale.
+    # The with-block closes the file even if writing a row raises.
+    with open('heise.csv', 'w', newline='', encoding='utf-8') as fobj:
+        csvw = csv.writer(fobj, delimiter=';')  # rows are delimited by ;
+        for entry in headlines:
+            header = entry.header  # first <header> tag inside the entry
+            if header is None:
+                # No <header> child: iterating None would raise TypeError.
+                continue
+            csvw.writerow(["Überschrift: ", *header])
+
+    print("\nDONE !\n\n\nHeise.de was scraped completely.\n")
+
+
+    
+    
+# main program
+
+if __name__ == "__main__":
+    # Run the scraper only when executed as a script, not on import.
+    main()
\ No newline at end of file