Skip to content
Snippets Groups Projects
Commit 23129704 authored by Annika Sommer
Browse files

fertige Version in Python3

parent 17b65b30
No related branches found
No related tags found
No related merge requests found
# python 3.6.1 #! /usr/bin/env python3
# imports # imports
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
...@@ -13,26 +13,64 @@ def has_source(tag): ...@@ -13,26 +13,64 @@ def has_source(tag):
def main():
    """Scrape heise.de headlines for the topic 'https' and report word counts.

    Fetches the topic page, writes one ';'-delimited row per matched entry
    to 'heise.csv' (each row starts with the literal cell "title: "), then
    re-reads that file split on spaces and prints the three most frequent
    words together with their number of occurrences.
    """
    csv_path = 'heise.csv'

    # The context manager closes the file even if the request or the parsing
    # raises; newline='' is what the csv module expects for writer files.
    with open(csv_path, 'w', newline='', encoding='utf-8') as fobj:
        csvw = csv.writer(fobj, delimiter=';')  # ';' separates the cells
        r = requests.get("https://www.heise.de/thema/https")
        soup = BeautifulSoup(r.content, "lxml")  # soup object
        headlines = soup.find_all(has_source)  # find the corresponding entries
        for line in headlines:
            txt = ["title: "]
            # NOTE(review): assumes every tag matched by has_source has a
            # <header> child; otherwise line.header is None -- confirm.
            for header in line.header:  # keep only the header tag's children
                txt.append(header)
            csvw.writerow(txt)
    print("\nHeise.de zum Thema 'https' komplett durchsucht.\n")

    word_count = _count_words(csv_path)
    first, second, third = _top_three(word_count)
    print("Das haeufigste Wort ist '%s' mit %i Vorkommen." % first)
    print("Das zweithaeufigste Wort ist '%s' mit %i Vorkommen." % second)
    print("Das dritthaeufigste Wort ist '%s' mit %i Vorkommen." % third)


def _count_words(path):
    """Return a Counter of the lower-cased, stripped words found in *path*.

    The file is parsed with the csv module using a space as delimiter, so
    every space-separated token of the file counts as one word.
    """
    from collections import Counter  # local import: file header not in view

    word_count = Counter()
    with open(path, newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quotechar='"')
        for row in reader:
            for word in row:
                # Chain the calls so the strip() result is not thrown away.
                w = word.strip().lower()
                if w:
                    word_count[w] += 1
    return word_count


def _top_three(word_count):
    """Return exactly three (word, count) tuples, most frequent first.

    Missing places are padded with ('', 0), so fewer than three distinct
    words no longer cause a KeyError.
    """
    ranked = word_count.most_common(3)
    ranked.extend([('', 0)] * (3 - len(ranked)))
    return ranked
# script entry point: run the scraper only when executed directly
if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment