From 04e00f363bec0c5aa349e6c85f9a78d6ee70c3fd Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Wed, 1 Apr 2020 15:14:05 -0500
Subject: [PATCH] address confusion with date

The timestamps in filenames should be the day the exports are run. For
the view data, the query date needs to be the day before, but that
query date should not be the timestamp we use in filenames.
---
 wikipedia/scripts/fetch_enwiki_daily_views.py | 14 +++++++-------
 wikipedia/scripts/fetch_enwiki_revisions.py   | 14 ++++----------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/wikipedia/scripts/fetch_enwiki_daily_views.py b/wikipedia/scripts/fetch_enwiki_daily_views.py
index 225b06d..f766ed8 100755
--- a/wikipedia/scripts/fetch_enwiki_daily_views.py
+++ b/wikipedia/scripts/fetch_enwiki_daily_views.py
@@ -23,7 +23,6 @@ from csv import DictWriter
 #import feather #TBD
 
 def parse_args():
-
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
@@ -42,10 +41,10 @@ def main():
 
     #handle -d
     if args.query_date:
-        queryDate = args.query_date
+        query_date = args.query_date
     else:
         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        queryDate = yesterday.strftime("%Y%m%d")
+        query_date = yesterday.strftime("%Y%m%d")
 
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -69,13 +68,14 @@ def main():
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
 
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
 
     #1 Load up the list of article names
-    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
-    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
+    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
+    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
 
     with open(articleFile, 'r') as infile:
         articleList = list(infile)
@@ -90,7 +90,7 @@ def main():
     #2 Repeatedly call the API with that list of names
     for a in articleList:
         a = a.strip("\"\n") #destringify
-        url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}00/{queryDate}00"
+        url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
 
         response = requests.get(url)
         if response.ok:
@@ -115,7 +115,7 @@ def main():
             # write out of the csv file
             dw.writerow(jd)
 
-    # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
+    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
     # read the json back in and make a feather file?
     logging.debug(f"Run complete at {datetime.datetime.now()}")
     logging.info(f"Processed {success} successful URLs and {failure} failures.")

diff --git a/wikipedia/scripts/fetch_enwiki_revisions.py b/wikipedia/scripts/fetch_enwiki_revisions.py
index 3ecd04f..e8f177d 100755
--- a/wikipedia/scripts/fetch_enwiki_revisions.py
+++ b/wikipedia/scripts/fetch_enwiki_revisions.py
@@ -1,4 +1,4 @@
-#!yusr/bin/env python3
+#!/usr/bin/env python3
 
 ###############################################################################
 #
@@ -26,7 +26,6 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
-    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
     args = parser.parse_args()
@@ -37,12 +36,6 @@ def main():
     output_path = args.output_folder
     article_filename = args.article_file
 
-    #handle -d
-    if args.query_date:
-        query_date = args.query_date
-    else:
-        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        query_date = yesterday.strftime("%Y%m%d")
 
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -66,12 +59,13 @@ def main():
 
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
 
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
 
-    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
-    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
+    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
+    tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
 
     api_session = api.Session("https://en.wikipedia.org/w/api.php")
-- 
2.39.2
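
Note (not part of the patch): a minimal sketch of the date convention the
commit message describes, assuming the same YYYYMMDD format the scripts
use. The article title "Pandemic" is only a placeholder.

import datetime

today = datetime.datetime.today()

# Filenames are stamped with the day the export runs.
export_date = today.strftime("%Y%m%d")  # e.g. "20200401"

# The pageviews query targets the previous (complete) day.
query_date = (today - datetime.timedelta(days=1)).strftime("%Y%m%d")  # e.g. "20200331"

filename = f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json"
url = (f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
       f"en.wikipedia/all-access/all-agents/Pandemic/daily/"
       f"{query_date}00/{query_date}00")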