From 1cec120dfab8768c3ade5b0bfa6e81e19e3d833c Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Thu, 2 Apr 2020 13:28:34 -0500 Subject: [PATCH] changes to allow historical view data collection - fix bug where it would fail if the first essay had no view data - add ability to override dates in the cron script --- cron-wikipedia_views.sh | 6 +++--- wikipedia/scripts/fetch_enwiki_daily_views.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cron-wikipedia_views.sh b/cron-wikipedia_views.sh index 4b39caa..4afe380 100644 --- a/cron-wikipedia_views.sh +++ b/cron-wikipedia_views.sh @@ -1,13 +1,13 @@ #!/bin/bash -x TZ="UTC" -date_string=$(date +%Y%m%d) +date_string=${OVERRIDE_DATE_STRING:-$(date +%Y%m%d)} view_log="enwp-daily_views-${date_string}.log" -./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/{$view_log}) +./wikipedia/scripts/wikiproject_scraper.py 2> >(tee wikipedia/logs/${view_log}) # get the list of files -./wikipedia/scripts/fetch_enwiki_daily_views.py 2> >(tee -a wikipedia/logs/${view_log}) +./wikipedia/scripts/fetch_enwiki_daily_views.py -d "${date_string}" 2> >(tee -a wikipedia/logs/${view_log}) mv wikipedia/logs/${view_log} /var/www/covid19/wikipedia/logs/${view_log} mv wikipedia/data/digobs_covid19-wikipedia-enwiki_dailyviews-${date_string}.tsv /var/www/covid19/wikipedia/ diff --git a/wikipedia/scripts/fetch_enwiki_daily_views.py b/wikipedia/scripts/fetch_enwiki_daily_views.py index 829343d..9f147e0 100755 --- a/wikipedia/scripts/fetch_enwiki_daily_views.py +++ b/wikipedia/scripts/fetch_enwiki_daily_views.py @@ -58,8 +58,8 @@ def main(): logging.info(f"Last commit: {digobs.git_hash()}") #1 Load up the list of article names - j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json") - t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv") + j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.json") + t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{query_date}.tsv") with open(articleFile, 'r') as infile: articleList = list(map(str.strip, infile)) @@ -82,6 +82,7 @@ def main(): else: failure = failure + 1 logging.warning(f"Failure: {response.status_code} from {url}") + continue # start writing the CSV File if it doesn't exist yet try: -- 2.39.2