X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/c54d8ba28ac8fe22b3da3ee171277fbabffe64fb..72bf7bcd3787ffbda4ec2c47204896483e8069c9:/wikipedia_views/scripts/fetch_daily_views.py diff --git a/wikipedia_views/scripts/fetch_daily_views.py b/wikipedia_views/scripts/fetch_daily_views.py index 5ce989f..e3a2ff4 100755 --- a/wikipedia_views/scripts/fetch_daily_views.py +++ b/wikipedia_views/scripts/fetch_daily_views.py @@ -17,8 +17,8 @@ import csv import time import os.path import datetime -#import feather - +import logging +#import feather #TBD def parse_args(): @@ -27,6 +27,8 @@ def parse_args(): parser.add_argument('-o', '--output_folder', help='Where to save output', default="../data/", type=str) parser.add_argument('-i', '--article_file', help='File listing article names', default="../resources/articles.txt", type=str) parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format please.', type=str) + parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info'), + parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/'), args = parser.parse_args() return(args) @@ -39,6 +41,7 @@ def main(): outputPath = args.output_folder articleFile = args.article_file + #handle -d if (args.query_date): queryDate = args.query_date else: @@ -47,39 +50,62 @@ def main(): queryDate = queryDate + "00" #requires specifying hours + #handle -W + logHome = f"{args.logging_destination}dailylogrun{datetime.datetime.today().strftime('%Y%m%d')}" + + #handle -L + loglevel = args.logging_level + if loglevel == 'debug': + logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG) + elif loglevel == 'info': + logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO) + elif loglevel == 'warning': + logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING) + elif loglevel == 'error': + logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR) + elif loglevel == 'critical': + logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL) + else: + print("Choose a valid log level: debug, info, warning, error, or critical") + exit + articleList = [] -#1 Load up the list of article names + logging.debug(f"Starting run at {datetime.datetime.now()}") - with open(articleFile, 'r') as infileHandle: - theInfile = csv.reader(infileHandle) - next(theInfile) #skip header - for currentLine in theInfile: - articleList.append(currentLine) + #1 Load up the list of article names - j_Out = outputPath + "dailyviews" + queryDate + ".json" - t_Out = outputPath + "dailyviews" + queryDate + ".tsv" + j_Out = f"{outputPath}dailyviews{queryDate}.json" + t_Out = f"{outputPath}dailyviews{queryDate}.tsv" - j = [] + with open(articleFile, 'r') as infile: + articleList = list(infile) - i = 0 #iterator to deal with end of file + j = [] + success = 0 #for logging how many work/fail + failure = 0 -#2 Repeatedly call the API with that list of names + #2 Repeatedly call the API with that list of names for a in articleList: - a = a[0] #destringify - i = i+1 - url= "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/" - url= url + a + "/daily/" + queryDate + "/" + queryDate #for now, single date at a time + a = a.strip("\"\n") #destringify + url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}" + response = requests.get(url) if response.ok: jd = json.loads(response.content) j.append(jd["items"][0]) time.sleep(.1) + success = success + 1 + else: + failure = failure + 1 + logging.warning(f"Failure: {response.status_code} from {url}") -#3 Save results as a JSON and TSV + #3 Save results as a JSON and TSV #all data in j now, make json file + logging.info(f"Processed {success} successful URLs and {failure} failures.") + with open(j_Out, 'w') as j_outfile: json.dump(j, j_outfile, indent=2) @@ -88,9 +114,10 @@ def main(): dw.writeheader() dw.writerows(j) + logging.debug(f"Run complete at {datetime.datetime.now()}") - f_Out = outputPath + "dailyviews" + queryDate + ".feather" - #read the json back in and make a feather file? + # f_Out = outputPath + "dailyviews" + queryDate + ".feather" + # read the json back in and make a feather file? if __name__ == "__main__":