diff --git a/heise.py b/heise.py index b367c0b8228fb08566e25f0183d4550589328340..d61e73fae03661c6195b56164383033e9d3695d2 100644 --- a/heise.py +++ b/heise.py @@ -1,4 +1,4 @@ -# python 3.6.1 +#! /usr/bin/env python3 # imports from bs4 import BeautifulSoup @@ -13,26 +13,64 @@ def has_source(tag): def main(): fobj = open('heise.csv', 'w') # open file - csvw = csv.writer(fobj, delimiter = ';') # create csv writer, set delimiter to ; + csvw = csv.writer(fobj, delimiter = ';') # create csv writer, set delimiter to ; - r = requests.get("https://www.heise.de/thema/https") # - soup = BeautifulSoup(r.content, "lxml") - headlines = (soup.find_all(has_source)) #find the corresponding entries + r = requests.get("https://www.heise.de/thema/https") + soup = BeautifulSoup(r.content, "lxml") # soup object + headlines = (soup.find_all(has_source)) # find the corresponding entries for line in headlines: - txt = ["Überschrift: "] - for header in line.header: #filter only for the header-tag + txt = ["title: "] + for header in line.header: #filter only for the header tag txt.append(header) csvw.writerow(txt) - #print(txt) - fobj.close() # close file - print("\nDONE !\n\n\nHeise.de was scraped completely.\n") + fobj.close() # close file + print("\nHeise.de zum Thema 'https' komplett durchsucht.\n") + # count words + word_count = {} + with open('heise.csv', newline='') as f: + reader = csv.reader(f, delimiter=' ', quotechar='\"') # delimiter changed to space + for row in reader: + for word in row: + w = word.strip() # clean the words + w = w.lower() # lower-case the stripped word (keep the strip result) + if len(w) > 0: + word_count[w] = word_count.get(w, 0) + 1 + + # not pretty, but it works... 
+ word_one, word_two, word_three = '','','' # places 1 2 and 3 + count_one, count_two, count_three = 0,0,0 # counts 1 2 and 3 + + # first place + for word in word_count: + if word_count[word] > count_one: + word_one = word + count_one = word_count[word] + + word_count.pop(word_one, None) # drop the winner; no KeyError if nothing was counted + + # second place + for word in word_count: + if word_count[word] > count_two: + word_two = word + count_two = word_count[word] + + word_count.pop(word_two, None) # drop the runner-up; no KeyError if fewer than two words + # third place + for word in word_count: + if word_count[word] > count_three: + word_three = word + count_three = word_count[word] + + print ("Das haeufigste Wort ist '%s' mit %i Vorkommen." % (word_one, count_one)) + print ("Das zweithaeufigste Wort ist '%s' mit %i Vorkommen." % (word_two, count_two)) + print ("Das dritthaeufigste Wort ist '%s' mit %i Vorkommen." % (word_three, count_three)) # main program if __name__ == '__main__': - main() \ No newline at end of file + main()