diff --git a/common_criteria_scraper/cc_portal_scraper.py b/common_criteria_scraper/cc_portal_scraper.py
index c9f36f72e26c50b5ec93cceb1e3ed07f68caa8b7..90ae4e6c54f4098dddd7f37dc663c29be215346c 100644
--- a/common_criteria_scraper/cc_portal_scraper.py
+++ b/common_criteria_scraper/cc_portal_scraper.py
@@ -1,8 +1,43 @@
 import urllib.request
+import urllib.parse
 import csv
 import os
 import sys
 
+def download(url):
+    """Download ``url`` into the current directory, named after its last path segment.
+
+    Empty or missing URLs are skipped. Failures are reported on stdout and
+    swallowed so that one bad link does not abort the whole run.
+    """
+    # csv.DictReader yields None for columns missing from a short row
+    if not url:
+        return
+    filename = url.split('/')[-1]
+    print(url)
+    print(filename)
+    try:
+        # drop any ':443' from the URL before requesting it
+        urllib.request.urlretrieve(iri2uri(url.replace(':443', '')), filename)
+    except ConnectionResetError:
+        print('Connection reset by server. Continuing')
+    except urllib.request.HTTPError:
+        print('File Not Found. Continuing')
+    except Exception as e:
+        print('Unexpected error. Continuing')
+        print(e)
+
+
+def iri2uri(url):
+    """Return ``url`` with its path percent-encoded so urllib can request it.
+
+    urllib only supports ASCII request targets. Only the path component is
+    quoted; '%' is kept as-is so escapes already present are not encoded twice.
+    """
+    parts = list(urllib.parse.urlsplit(url))
+    parts[2] = urllib.parse.quote(parts[2], safe='/%')
+    return urllib.parse.urlunsplit(parts)
+
 csv_url = 'https://www.commoncriteriaportal.org/products/certified_products.csv'
 # download the csv from commoncriterialportal.org
 urllib.request.urlretrieve(csv_url, 'certified_products.csv')
@@ -12,7 +47,8 @@ with open('certified_products.csv', 'r', errors="ignore") as certified_products,
     reader = csv.reader(certified_products)
     line = 0
     for line, row in enumerate(reader):
-        if 'Mobility' in row or line == 0:
+        # each category needs its own membership test: a bare string is always truthy
+        if 'Mobility' in row or 'ICs, Smart Cards and Smart Card-Related Devices and Systems' in row or line == 0:
             writer.writerow(row)
 
 try:
@@ -32,26 +68,9 @@ with open ('filtered.csv', 'r') as filtered:
         sys.exit(1)
     csv_dict = csv.DictReader(filtered)
     for row in csv_dict:
-        if row['Certification Report URL'] != '':
-            try:
-                urllib.request.urlretrieve(row['Certification Report URL'].replace(' ','%20').replace(':443',''),(row['Certification Report URL'].split('/')[-1]))
-            except ConnectionResetError:
-                print('Connection reset by server. Continuing')
-        if row['Security Target URL'] != '':
-            try:
-                urllib.request.urlretrieve(row['Security Target URL'].replace(' ','%20').replace(':443',''),(row['Security Target URL'].split('/')[-1]))
-            except ConnectionResetError:
-                print('Connection reset by server. Continuing')
-        if row['Maintenance Report'] != '':
-            try:
-                urllib.request.urlretrieve(row['Maintenance Report'].replace(' ','%20').replace(':443',''),(row['Maintenance Report'].split('/')[-1]))
-            except ConnectionResetError:
-                print('Connection reset by server. Continuing')
-        if row['Maintenance ST'] != '':
-            try:
-                urllib.request.urlretrieve(row['Maintenance ST'].replace(' ','%20').replace(':443',''),(row['Maintenance ST'].split('/')[-1]))
-            except ConnectionResetError:
-                print('Connection reset by server. Continuing')
-
+        download(row['Certification Report URL'])
+        download(row['Security Target URL'])
+        download(row['Maintenance Report'])
+        download(row['Maintenance ST'])
+
 
-        
\ No newline at end of file