fertige Version in Python3

23129704 · Annika Sommer · 17b65b30 · 23129704
Commit 23129704 authored 8 years ago by Annika Sommer
--- a/heise.py
+++ b/heise.py
-# python 3.6.1
+#! /usr/bin/env python3
 # imports
 from bs4 import BeautifulSoup
@@ -13,26 +13,64 @@ def has_source(tag):
 def main():
    fobj = open('heise.csv', 'w')      # open file
-    csvw = csv.writer(fobj, delimiter = ';')      # create csv writer, set delimiter to ;
+    csvw = csv.writer(fobj, delimiter = ';')   # create csv writer, set delimiter to ;
-    r = requests.get("https://www.heise.de/thema/https") #
+    r = requests.get("https://www.heise.de/thema/https") 
-    soup = BeautifulSoup(r.content, "lxml")
+    soup = BeautifulSoup(r.content, "lxml") # soup object
-    headlines = (soup.find_all(has_source)) #find the corresponding entries
+    headlines = (soup.find_all(has_source)) # find the corresponding entries
    for line in headlines:
-        txt = ["Überschrift: "]
+        txt = ["title: "]
-        for header in line.header: #filter only for the header-tag
+        for header in line.header:  #filter only for the header tag
            txt.append(header)
        csvw.writerow(txt)
-        #print(txt)
-    fobj.close()                                # close file
+    fobj.close()                              # close file
-    print("\nDONE !\n\n\nHeise.de was scraped completely.\n")
+    print("\nHeise.de zum Thema 'https' komplett durchsucht.\n")
+    # count words: re-read the CSV split on spaces, so every space-separated token is one word
+    word_count = {}   
+    with open('heise.csv', newline='') as f:
+        reader = csv.reader(f, delimiter=' ', quotechar='\"') # delimiter changed to space
+        for row in reader:
+            for word in row:
+                w = word.strip() # clean the words
+                w = w.lower() # lower-case the stripped word (word.lower() would discard the strip)
+                if len(w) > 0:
+                    word_count[w] = word_count.get(w, 0) + 1
+    # not pretty, but it works: three max-scans, removing each winner before the next scan
+    word_one, word_two, word_three = '','',''    # places 1 2 and 3
+    count_one, count_two, count_three = 0,0,0    # counts 1 2 and 3
+    # first place
+    for word in word_count:
+        if word_count[word] > count_one:
+            word_one = word
+            count_one = word_count[word]
+    word_count.pop(word_one, None)   # pop instead of del: no KeyError if no word was found
+    # second place
+    for word in word_count:
+        if word_count[word] > count_two:
+            word_two = word
+            count_two = word_count[word]
+    word_count.pop(word_two, None)   # pop instead of del: no KeyError if fewer than two words
+    # third place
+    for word in word_count:
+        if word_count[word] > count_three:
+            word_three = word
+            count_three = word_count[word]
+    print ("Das haeufigste Wort ist '%s' mit %i Vorkommen." % (word_one, count_one))
+    print ("Das zweithaeufigste Wort ist '%s' mit %i Vorkommen." % (word_two, count_two))
+    print ("Das dritthaeufigste Wort ist '%s' mit %i Vorkommen." % (word_three, count_three)) 
 # main program
 if __name__ == '__main__':
    main()
\ No newline at end of file