wikipedia/scripts/fetch_enwiki_daily_views.py

   1 #!/usr/bin/env python3
   2
   3 ###############################################################################
   4 #
   5 # This script assumes the presence of the COVID-19 repo.
   6 #
   7 # It (1) reads in the article list and then (2) calls the Wikimedia API to
   8 # fetch view information for each article. Output is to (3) JSON and TSV.
   9 #
  10 ###############################################################################
  11
  12 import sys
  13 import requests
  14 import argparse
  15 import json
  16 import time
  17 import os.path
  18 import datetime
  19 import logging
  20 from csv import DictWriter
  21 import digobs
  22 #import feather #TBD
  23
  24 def parse_args():
  25     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
  26     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
  27     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
  28     parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
  29     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
  30     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
  31     args = parser.parse_args()
  32     return(args)
  33
  34 def main():
  35
  36     args = parse_args()
  37
  38     outputPath = args.output_folder
  39     articleFile = args.article_file
  40
  41     #handle -d
  42     if args.query_date:
  43         query_date = args.query_date
  44     else:
  45         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
  46         query_date = yesterday.strftime("%Y%m%d")
  47
  48     #handle -L
  49     loglevel = digobs.get_loglevel(args.logging_level)
  50
  51     #handle -W
  52     if args.logging_destination:
  53         logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
  54     else:
  55         logging.basicConfig(level=loglevel)
  56
  57     export_time = str(datetime.datetime.now())
  58     export_date = datetime.datetime.today().strftime("%Y%m%d")
  59
  60     logging.info(f"Starting run at {export_time}")
  61     logging.info(f"Last commit: {digobs.git_hash()}")
  62
  63     #1 Load up the list of article names
  64     j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
  65     t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
  66
  67     with open(articleFile, 'r') as infile:
  68         articleList = list(map(str.strip, infile))
  69
  70     success = 0 #for logging how many work/fail
  71     failure = 0
  72
  73     #3 Save results as a JSON and TSV
  74     with open(j_outfilename, 'w') as j_outfile, \
  75          open(t_outfilename, 'w') as t_outfile:
  76
  77         #2 Repeatedly call the API with that list of names
  78         for a in articleList:
  79             url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
  80
  81             response = requests.get(url)
  82             if response.ok:
  83                 jd = response.json()["items"][0]
  84                 success = success + 1
  85             else:
  86                 failure = failure + 1
  87                 logging.warning(f"Failure: {response.status_code} from {url}")
  88
  89             # start writing the CSV File if it doesn't exist yet
  90             try:
  91                 dw
  92             except NameError:
  93                 dw = DictWriter(t_outfile, sorted(jd.keys()), delimiter='\t')
  94                 dw.writeheader()
  95
  96             logging.debug(f"printing data: {jd}")
  97
  98             # write out the line of the json file
  99             print(json.dumps(jd), file=j_outfile)
 100
 101             # write out of the csv file
 102             dw.writerow(jd)
 103
 104     # f_Out = outputPath + "dailyviews" + query_date + ".feather"
 105     # read the json back in and make a feather file?
 106     logging.debug(f"Run complete at {datetime.datetime.now()}")
 107     logging.info(f"Processed {success} successful URLs and {failure} failures.")
 108
 109
# Run the fetch only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":

    main()