Skip to content
Snippets Groups Projects
Commit 09518761 authored by djcaballero's avatar djcaballero
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
#import libraries
import unicodecsv as csv
import requests
from bs4 import BeautifulSoup
def getSoup(url, timeout=10):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP response (default 10).
            Without a timeout, requests.get() can block forever on a
            stalled connection.

    Returns:
        BeautifulSoup object built with the 'html.parser' backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On network failure or timeout.
    """
    # Query the website; fail loudly on an HTTP error status instead of
    # silently parsing an error page as if it were real content.
    page_query = requests.get(url, timeout=timeout)
    page_query.raise_for_status()
    # Parse the HTML using BeautifulSoup.
    soup_page = BeautifulSoup(page_query.text, 'html.parser')
    return soup_page
def main():
    """Scrape article headers from the heise.de 'https' topic pages.

    Fetches the topic index page and three paginated follow-up pages,
    extracts every <header> element found under
    <div class="keywordliste"><nav>, and writes one CSV row per header
    to 'heise_articles.csv'.
    """
    # Topic index plus the three paginated follow-up pages
    # (?seite=1 .. ?seite=3), replacing the repetitive url1..url4 setup.
    base_url = 'https://www.heise.de/thema/https'
    urls = [base_url] + ['%s?seite=%d' % (base_url, n) for n in range(1, 4)]

    # 'wb' + unicodecsv is the Python 2 convention; the writer performs
    # the UTF-8 encoding itself. `with` guarantees the file is closed
    # (the original leaked the handle on any exception).
    with open('heise_articles.csv', 'wb') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        for url in urls:
            # Get the parent <div class="keywordliste">...</div>.
            parent_div = getSoup(url).find('div', attrs={'class': 'keywordliste'})
            # Guard against layout changes / empty pages: skip rather
            # than crash with AttributeError on None.
            if parent_div is None or parent_div.nav is None:
                continue
            # Get all headers inside the <nav>...</nav> and write them out.
            for article in parent_div.nav.findAll('header'):
                # Pass unicode text straight to unicodecsv; manual
                # .encode('utf-8') here risks double-encoding, since the
                # writer already encodes every cell.
                writer.writerow([article.text])


if __name__ == '__main__':
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment