wikipedia_views/scripts/fetch_enwiki_daily_views.py

   1 #!/usr/bin/env python3
   2
   3 ###############################################################################
   4 #
   5 # This script assumes the presence of the COVID-19 repo.
   6 #
   7 # It (1) reads in the article list and then (2) calls the Wikimedia API to
   8 # fetch view information for each article. Output is to (3) JSON and TSV.
   9 #
  10 ###############################################################################
  11
  12 import sys
  13 import subprocess
  14 import requests
  15 import argparse
  16 import json
  17 import time
  18 import os.path
  19 import argparse
  20 import datetime
  21 import logging
  22 from csv import DictWriter
  23 #import feather #TBD
  24
  25 def parse_args():
  26
  27     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
  28     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia_views/data", type=str)
  29     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia_views/resources/enwp_wikiproject_covid19_articles.txt", type=str)
  30     parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
  31     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
  32     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
  33     args = parser.parse_args()
  34     return(args)
  35
  36 def main():
  37
  38     args = parse_args()
  39
  40     outputPath = args.output_folder
  41     articleFile = args.article_file
  42
  43     #handle -d
  44     if args.query_date:
  45         queryDate = args.query_date
  46     else:
  47         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
  48         queryDate = yesterday.strftime("%Y%m%d")
  49
  50     queryDate = queryDate + "00" #requires specifying hours
  51
  52     #handle -L
  53     loglevel_mapping = { 'debug' : logging.DEBUG,
  54                          'info' : logging.INFO,
  55                          'warning' : logging.WARNING,
  56                          'error' : logging.ERROR,
  57                          'critical' : logging.CRITICAL }
  58
  59     if args.logging_level in loglevel_mapping:
  60         loglevel = loglevel_mapping[args.logging_level]
  61     else:
  62         print("Choose a valid log level: debug, info, warning, error, or critical")
  63         exit
  64
  65     #handle -W
  66     if args.logging_destination:
  67         logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
  68     else:
  69         logging.basicConfig(level=loglevel)
  70
  71     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
  72     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
  73     export_time = str(datetime.datetime.now())
  74
  75     logging.info(f"Starting run at {export_time}")
  76     logging.info(f"Last commit: {export_git_hash}")
  77
  78     #1 Load up the list of article names
  79     j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
  80     t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
  81
  82     with open(articleFile, 'r') as infile:
  83         articleList = list(infile)
  84
  85     success = 0 #for logging how many work/fail
  86     failure = 0
  87
  88     #3 Save results as a JSON and TSV
  89     with open(j_outfilename, 'w') as j_outfile, \
  90          open(t_outfilename, 'w') as t_outfile:
  91
  92         #2 Repeatedly call the API with that list of names
  93         for a in articleList:
  94             a = a.strip("\"\n") #destringify
  95             url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}/{queryDate}"
  96
  97             response = requests.get(url)
  98             if response.ok:
  99                 jd = response.json()["items"][0]
 100                 success = success + 1
 101             else:
 102                 failure = failure + 1
 103                 logging.warning(f"Failure: {response.status_code} from {url}")
 104
 105             # start writing the CSV File if it doesn't exist yet
 106             try:
 107                 dw
 108             except NameError:
 109                 dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
 110                 dw.writeheader()
 111
 112             logging.debug(f"printing data: {jd}")
 113
 114             # write out the line of the json file
 115             print(json.dumps(jd), file=j_outfile)
 116
 117             # write out of the csv file
 118             dw.writerow(jd)
 119
 120     # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
 121     # read the json back in and make a feather file?
 122     logging.debug(f"Run complete at {datetime.datetime.now()}")
 123     logging.info(f"Processed {success} successful URLs and {failure} failures.")
 124
 125
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":

    main()