Project: dbs_assignment05_python_scraper
Commit 1171a22e, authored 7 years ago by yklingele
Upload new file
Parent: 1135aaee

1 changed file: heisescraper_klingele_brose.py (new file, mode 100644, 48 additions, 0 deletions)
# -*- coding: cp1252 -*-
# imports
from bs4 import BeautifulSoup
import requests
import csv
import collections
import itertools


def main():
    print "Scraping from heise.de/thema/https\n"
    # create the csv file with delimiter ;
    csvfile = csv.writer(open('yannis_julius_heisescraper.csv', 'w'), delimiter=';')
    # repeat the process for each page of heise.de on the topic https
    for page in range(0, 4, 1):
        heise_url = "https://www.heise.de/thema/https?seite=" + str(page) + ".html"
        data = requests.get(heise_url).text
        content = BeautifulSoup(data, "lxml")
        # find all divs with class 'keywordliste'
        keywords = content.findAll('div', class_='keywordliste')
        for header in keywords:
            header = header.findAll('header')
            # delete the very first header, which is not an article headline
            del header[0]
            txt = []
            # append all headers to a list
            for h in header:
                # rstrip removes the trailing \n from the headers
                txt.append(h.text.encode('utf-8').rstrip())
            # add the headers to the csv file
            csvfile.writerow(txt)
    print "Finished scraping from heise.de/thema/https\n"
    print "Continuing with counting word frequency\n"
    # split all the words on the delimiter ; and put them into a list
    split_words = [item.split(';') for item in open('yannis_julius_heisescraper.csv', 'r').read().split()]
    merged_list = list(itertools.chain(*split_words))
    # count the frequency of each word, put the words into a dictionary with the amount
    wordfrequency = [merged_list.count(word) for word in merged_list]
    dictionary = dict(zip(merged_list, wordfrequency))
    # sort the counts
    aux = [(dictionary[key], key) for key in dictionary]
    aux.sort()
    aux.reverse()
    # print the top 3 results
    print "Results:\n 1.", aux[0], "\n 2.", aux[1], "\n 3.", aux[2]


# execution
if __name__ == '__main__':
    main()
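The committed script targets Python 2 (print statements, explicit .encode('utf-8')). As a point of comparison, here is a minimal sketch of the scraping half on Python 3 with the same libraries. The URL, the 'keywordliste' class, and the ?seite= paging are taken from the script above; that heise.de still serves this markup is an assumption, and scrape_headlines is a hypothetical name:

# Hypothetical Python 3 port of the scraping step (not part of this commit).
import csv
import requests
from bs4 import BeautifulSoup

def scrape_headlines(pages=4, outfile='yannis_julius_heisescraper.csv'):
    with open(outfile, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=';')
        for page in range(pages):
            # the committed code appends ".html" to the query value;
            # a plain ?seite=N parameter is assumed here instead
            resp = requests.get('https://www.heise.de/thema/https',
                                params={'seite': page})
            soup = BeautifulSoup(resp.text, 'lxml')
            for div in soup.find_all('div', class_='keywordliste'):
                # skip the first header, which the original treats as non-article
                headers = div.find_all('header')[1:]
                writer.writerow(h.get_text().strip() for h in headers)

The with block also closes the CSV file deterministically, which the committed code leaves to garbage collection.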
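The word-frequency step calls merged_list.count(word) once per word, which is quadratic, and the collections import above is never used; collections.Counter looks like what was intended and produces the same top-3 in a single pass. A sketch under that assumption (top_words is a hypothetical helper; the file name and ';' delimiter come from the script):

# Hypothetical rewrite of the counting step (not part of this commit).
import itertools
from collections import Counter

def top_words(csvpath='yannis_julius_heisescraper.csv', n=3):
    with open(csvpath, encoding='utf-8') as f:
        tokens = f.read().split()
    # split each whitespace token on the ';' column delimiter,
    # mirroring the tokenisation in the committed script
    words = itertools.chain.from_iterable(t.split(';') for t in tokens)
    return Counter(words).most_common(n)  # [(word, count), ...], single pass

if __name__ == '__main__':
    for rank, (word, count) in enumerate(top_words(), start=1):
        print(f"{rank}. {word} ({count})")

Counter.most_common(n) replaces the manual zip/sort/reverse and avoids recounting each repeated word.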