Skip to content
Snippets Groups Projects
Commit 17b65b30 authored by sommerann's avatar sommerann
Browse files

first version with import

parent eb22c268
No related branches found
No related tags found
No related merge requests found
heise.py 0 → 100644
# python 3.6.1
# imports
from bs4 import BeautifulSoup
import requests
import csv
def has_source(tag):
    """Return True when *tag* carries both attributes that mark a result entry.

    Used as a filter function for ``find_all``: a tag is accepted only if it
    has a ``data-sourcechannel`` attribute and a ``class`` attribute.
    """
    wanted_attributes = ('data-sourcechannel', 'class')
    return all(tag.has_attr(attribute) for attribute in wanted_attributes)
# scraper website: https://www.heise.de/thema/https
def main():
    """Scrape the entries of heise.de's "https" topic page into heise.csv.

    Downloads https://www.heise.de/thema/https, selects every tag accepted by
    has_source(), and writes one semicolon-delimited CSV row per entry: the
    label "Überschrift: " followed by the children of the entry's <header>
    tag. Entries without a <header> child are skipped.

    Raises:
        requests.RequestException: if the page cannot be fetched (network
            failure, timeout, or an HTTP error status).
    """
    # Fetch and parse before touching the output file, so a failed request
    # does not truncate an existing heise.csv.
    r = requests.get("https://www.heise.de/thema/https", timeout=30)
    r.raise_for_status()  # do not silently "scrape" an HTTP error page
    soup = BeautifulSoup(r.content, "lxml")
    headlines = soup.find_all(has_source)  # find the corresponding entries

    # newline='' is what the csv module expects from its file object (avoids
    # blank rows on Windows); explicit UTF-8 keeps "Überschrift" intact
    # regardless of the platform's default encoding. The with-block closes
    # the file even if writing fails.
    with open('heise.csv', 'w', newline='', encoding='utf-8') as fobj:
        csvw = csv.writer(fobj, delimiter=';')  # set delimiter to ;
        for line in headlines:
            header = line.header  # first <header> descendant, or None
            if header is None:
                # Iterating None would raise TypeError; nothing to record.
                continue
            csvw.writerow(["Überschrift: ", *header])

    print("\nDONE !\n\n\nHeise.de was scraped completely.\n")
# script entry point: run the scraper only when executed directly
if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment