# -*- coding: cp1252 -*-
"""Scrape headlines from heise.de/thema/https and report the top words.

The script downloads the first pages of the heise.de topic page for
"https", stores the headlines of every keyword list in a semicolon
separated CSV file and finally prints the three most frequent words
found in that file.

Requires Python 3 and the third-party packages ``requests``,
``beautifulsoup4`` and ``lxml`` (the parser passed to BeautifulSoup).

Origin: payload of the git patch "Upload new file" (commit 1171a22e,
9 Jun 2017), which created ``heisescraper_klingele_brose.py``.
"""
# imports
import collections
import csv
import itertools

import requests
from bs4 import BeautifulSoup

# Topic URL prefix; the page number and ".html" are appended per request.
BASE_URL = "https://www.heise.de/thema/https?seite="
# File the headlines are written to and the word count is read from.
CSV_PATH = 'yannis_julius_heisescraper.csv'
# Pages 0 .. PAGE_COUNT - 1 are scraped.
PAGE_COUNT = 4
# Number of ranking entries printed at the end.
TOP_N = 3
# Seconds to wait for heise.de before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _headline_rows(page):
    """Return one list of headline strings per keyword list on *page*.

    Args:
        page: Page number that is inserted into the topic URL.

    Returns:
        A list of rows; each row holds the headlines of one
        ``<div class="keywordliste">`` element.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    heise_url = BASE_URL + str(page) + ".html"
    data = requests.get(heise_url, timeout=REQUEST_TIMEOUT).text
    content = BeautifulSoup(data, "lxml")

    rows = []
    for keyword_list in content.find_all('div', class_='keywordliste'):
        # The very first <header> of each list is skipped, as in the
        # original script.  Slicing (unlike ``del headers[0]``) does not
        # fail when a list contains no <header> at all.
        headers = keyword_list.find_all('header')[1:]
        # Collapse all whitespace so that no line break ends up inside a
        # CSV cell; such cells would be quoted and the quote characters
        # would later be counted as words.
        rows.append([' '.join(header.text.split()) for header in headers])
    return rows


def _write_headlines(path):
    """Scrape all pages and write their headline rows to the CSV at *path*."""
    # ``with`` guarantees the file is flushed and closed before it is
    # read again for the word count.
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=';')
        for page in range(PAGE_COUNT):
            for row in _headline_rows(page):
                if row:  # lists without headlines carry no data
                    writer.writerow(row)


def _rank_words(text):
    """Return ``(count, word)`` tuples for *text*, most frequent first.

    Words are obtained by splitting *text* on whitespace and then on the
    CSV delimiter ``;``.  Empty fragments are ignored.  Ties in the count
    are ordered by word, descending, because whole tuples are compared.
    """
    fragments = itertools.chain.from_iterable(
        chunk.split(';') for chunk in text.split()
    )
    counts = collections.Counter(word for word in fragments if word)
    return sorted(((count, word) for word, count in counts.items()),
                  reverse=True)


def _format_results(ranking):
    """Return the result text for *ranking*, one numbered entry per line."""
    entries = [
        "\n %d. %s" % (position, entry)
        for position, entry in enumerate(ranking, 1)
    ]
    return "Results: " + " ".join(entries)


def main():
    """Scrape the headlines, store them as CSV and print the top words.

    Side effects: performs HTTP requests to heise.de, (over)writes
    ``CSV_PATH`` in the current working directory and prints progress
    messages and the final ranking to standard output.
    """
    print("Scraping from heise.de/thema/https\n")
    _write_headlines(CSV_PATH)
    print("Finished scraping from heise.de/thema/https\n")
    print("Continuing with counting word frequency\n")

    with open(CSV_PATH, 'r', encoding='utf-8') as handle:
        ranking = _rank_words(handle.read())

    # Slicing keeps this safe when fewer than TOP_N distinct words exist.
    print(_format_results(ranking[:TOP_N]))


# execution
if __name__ == '__main__':
    main()