X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/ee91df4c0441266c994ec8a4fd1c8389dd42731d..50f58a3887692505e7408da3784d3d55c55e6122:/wikipedia_views/scripts/fetch_daily_views.py diff --git a/wikipedia_views/scripts/fetch_daily_views.py b/wikipedia_views/scripts/fetch_daily_views.py index ab824c2..4f496fe 100755 --- a/wikipedia_views/scripts/fetch_daily_views.py +++ b/wikipedia_views/scripts/fetch_daily_views.py @@ -17,7 +17,8 @@ import csv import time import os.path import datetime -#import feather +import logging +#import feather #TBD def parse_args(): @@ -26,6 +27,8 @@ def parse_args(): parser.add_argument('-o', '--output_folder', help='Where to save output', default="../data/", type=str) parser.add_argument('-i', '--article_file', help='File listing article names', default="../resources/articles.txt", type=str) parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format please.', type=str) + parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'), + parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/'), args = parser.parse_args() return(args) @@ -38,6 +41,7 @@ def main(): outputPath = args.output_folder articleFile = args.article_file + #handle -d if (args.query_date): queryDate = args.query_date else: @@ -46,8 +50,29 @@ def main(): queryDate = queryDate + "00" #requires specifying hours + #handle -W + logHome = f"{args.logging_destination}dailylogrun{datetime.datetime.today().strftime('%Y%m%d')}" + + #handle -L + loglevel = args.logging_level + if loglevel == 'debug': + logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG) + elif loglevel == 'info': + logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO) + elif loglevel == 'warning': + logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING) + elif loglevel == 'error': + logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR) + elif loglevel == 'critical': + logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL) + else: + print("Choose a valid log level: debug, info, warning, error, or critical") + exit + articleList = [] + logging.debug(f"Starting run at {datetime.datetime.now()}") + #1 Load up the list of article names j_Out = f"{outputPath}dailyviews{queryDate}.json" @@ -57,25 +82,31 @@ def main(): next(infile) #skip header articleList = list(infile) - j = [] + j = [] + success = 0 #for logging how many work/fail + failure = 0 - #2 Repeatedly call the API with that list of names + #2 Repeatedly call the API with that list of names - for a in articleList: - a = a.strip("\"\n") #destringify - url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}" + for a in articleList: + a = a.strip("\"\n") #destringify + url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}" - response = requests.get(url) - if response.ok: - jd = json.loads(response.content) - j.append(jd["items"][0]) - time.sleep(.1) - else: - print(f"Not ok response: {response.status_code} from {url}") - - #3 Save results as a JSON and TSV + response = requests.get(url) + if response.ok: + jd = json.loads(response.content) + j.append(jd["items"][0]) + time.sleep(.1) + success = success + 1 + else: + failure = failure + 1 + logging.warning(f"Failure: {response.status_code} from {url}") + + #3 Save results as a JSON and TSV #all data in j now, make json file + logging.info(f"Processed {success} successful URLs and {failure} failures.") + with open(j_Out, 'w') as j_outfile: json.dump(j, j_outfile, indent=2) @@ -84,6 +115,7 @@ def main(): dw.writeheader() dw.writerows(j) + logging.debug(f"Run complete at {datetime.datetime.now()}") # f_Out = outputPath + "dailyviews" + queryDate + ".feather" # read the json back in and make a feather file?