wikipedia/scripts/fetch_enwiki_daily_views.py

   1 #!/usr/bin/env python3
   2
   3 ###############################################################################
   4 #
   5 # This script assumes the presence of the COVID-19 repo.
   6 #
   7 # It (1) reads in the article list and then (2) calls the Wikimedia API to
   8 # fetch view information for each article. Output is to (3) JSON and TSV.
   9 #
  10 ###############################################################################
  11
  12 import sys
  13 import subprocess
  14 import requests
  15 import argparse
  16 import json
  17 import time
  18 import os.path
  19 import argparse
  20 import datetime
  21 import logging
  22 from csv import DictWriter
  23 #import feather #TBD
  24
  25 def parse_args():
  26     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
  27     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
  28     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
  29     parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
  30     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
  31     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
  32     args = parser.parse_args()
  33     return(args)
  34
  35 def main():
  36
  37     args = parse_args()
  38
  39     outputPath = args.output_folder
  40     articleFile = args.article_file
  41
  42     #handle -d
  43     if args.query_date:
  44         query_date = args.query_date
  45     else:
  46         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
  47         query_date = yesterday.strftime("%Y%m%d")
  48
  49     #handle -L
  50     loglevel_mapping = { 'debug' : logging.DEBUG,
  51                          'info' : logging.INFO,
  52                          'warning' : logging.WARNING,
  53                          'error' : logging.ERROR,
  54                          'critical' : logging.CRITICAL }
  55
  56     if args.logging_level in loglevel_mapping:
  57         loglevel = loglevel_mapping[args.logging_level]
  58     else:
  59         print("Choose a valid log level: debug, info, warning, error, or critical")
  60         exit
  61
  62     #handle -W
  63     if args.logging_destination:
  64         logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
  65     else:
  66         logging.basicConfig(level=loglevel)
  67
  68     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
  69     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
  70     export_time = str(datetime.datetime.now())
  71     export_date = datetime.datetime.today().strftime("%Y%m%d")
  72
  73     logging.info(f"Starting run at {export_time}")
  74     logging.info(f"Last commit: {export_git_hash}")
  75
  76     #1 Load up the list of article names
  77     j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
  78     t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
  79
  80     with open(articleFile, 'r') as infile:
  81         articleList = list(infile)
  82
  83     success = 0 #for logging how many work/fail
  84     failure = 0
  85
  86     #3 Save results as a JSON and TSV
  87     with open(j_outfilename, 'w') as j_outfile, \
  88          open(t_outfilename, 'w') as t_outfile:
  89
  90         #2 Repeatedly call the API with that list of names
  91         for a in articleList:
  92             a = a.strip("\"\n") #destringify
  93             url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
  94
  95             response = requests.get(url)
  96             if response.ok:
  97                 jd = response.json()["items"][0]
  98                 success = success + 1
  99             else:
 100                 failure = failure + 1
 101                 logging.warning(f"Failure: {response.status_code} from {url}")
 102
 103             # start writing the CSV File if it doesn't exist yet
 104             try:
 105                 dw
 106             except NameError:
 107                 dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
 108                 dw.writeheader()
 109
 110             logging.debug(f"printing data: {jd}")
 111
 112             # write out the line of the json file
 113             print(json.dumps(jd), file=j_outfile)
 114
 115             # write out of the csv file
 116             dw.writerow(jd)
 117
 118     # f_Out = outputPath + "dailyviews" + query_date + ".feather"
 119     # read the json back in and make a feather file?
 120     logging.debug(f"Run complete at {datetime.datetime.now()}")
 121     logging.info(f"Processed {success} successful URLs and {failure} failures.")
 122
 123
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":

    main()