address confusion with date
author Benjamin Mako Hill <mako@atdot.cc>
Wed, 1 Apr 2020 20:14:05 +0000 (15:14 -0500)
committer Benjamin Mako Hill <mako@atdot.cc>
Wed, 1 Apr 2020 20:14:05 +0000 (15:14 -0500)
The timestamps in files should be the day on which the exports are done. For
the view data, the query date needs to be the day before, but that date
shouldn't be the timestamp we use in files, etc.

wikipedia/scripts/fetch_enwiki_daily_views.py
wikipedia/scripts/fetch_enwiki_revisions.py

index 225b06d7aaaa0135a78b75cbcdc7625dce5f366f..f766ed8eb6ffaa56ba1fa1bf08614cd7fb4c12ba 100755 (executable)
@@ -23,7 +23,6 @@ from csv import DictWriter
 #import feather #TBD
 
 def parse_args():
-
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
@@ -42,10 +41,10 @@ def main():
 
     #handle -d
     if args.query_date:
-        queryDate = args.query_date
+        query_date = args.query_date
     else:
         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        queryDate = yesterday.strftime("%Y%m%d")
+        query_date = yesterday.strftime("%Y%m%d")
 
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -69,13 +68,14 @@ def main():
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
 
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
 
     #1 Load up the list of article names
-    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
-    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
+    j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
+    t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
 
     with open(articleFile, 'r') as infile:
         articleList = list(infile)
@@ -90,7 +90,7 @@ def main():
         #2 Repeatedly call the API with that list of names
         for a in articleList:
             a = a.strip("\"\n") #destringify
-            url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}00/{queryDate}00"
+            url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
 
             response = requests.get(url)
             if response.ok:
@@ -115,7 +115,7 @@ def main():
             # write out of the csv file
             dw.writerow(jd)
 
-    # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
+    # f_Out = outputPath + "dailyviews" + query_date + ".feather"
     # read the json back in and make a feather file? 
     logging.debug(f"Run complete at {datetime.datetime.now()}")
     logging.info(f"Processed {success} successful URLs and {failure} failures.")
index 3ecd04f5370f890a04d45f11a39cb8f76b17d84c..e8f177db93aa6fb950a3e5a3c568e81a05469e19 100755 (executable)
@@ -1,4 +1,4 @@
-#!yusr/bin/env python3
+#!/usr/bin/env python3
 
 ###############################################################################
 #
@@ -26,7 +26,6 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
-    parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str), 
     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str), 
     args = parser.parse_args()
@@ -37,12 +36,6 @@ def main():
 
     output_path = args.output_folder
     article_filename = args.article_file
-    #handle -d
-    if args.query_date:
-        query_date = args.query_date
-    else:
-        yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
-        query_date = yesterday.strftime("%Y%m%d")
 
     #handle -L
     loglevel_mapping = { 'debug' : logging.DEBUG,
@@ -66,12 +59,13 @@ def main():
     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
     export_time = str(datetime.datetime.now())
+    export_date = datetime.datetime.today().strftime("%Y%m%d")
 
     logging.info(f"Starting run at {export_time}")
     logging.info(f"Last commit: {export_git_hash}")
 
-    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
-    tsv_output_filename =  os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
+    json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
+    tsv_output_filename =  os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
     
     api_session = api.Session("https://en.wikipedia.org/w/api.php")
 

Community Data Science Collective || Want to submit a patch?