wikipedia/scripts/fetch_enwiki_revisions.py

   1 #!/usr/bin/env python3
   2
   3 ###############################################################################
   4 #
   5 # This script assumes the presence of the COVID-19 repo.
   6 #
   7 # It (1) reads in the article list and then (2) calls the Wikimedia API to
   8 # fetch revision information for each article. Output is to (3) JSON and TSV.
   9 #
  10 ###############################################################################
  11
  12 import argparse
  13 import logging
  14 import os.path
  15 import json
  16 import subprocess
  17 import datetime
  18
  19 from requests import Request
  20 from csv import DictWriter
  21 from mw import api
  22
  23
  24 def parse_args():
  25
  26     parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
  27     parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
  28     parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
  29     parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
  30     parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
  31     parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
  32     args = parser.parse_args()
  33     return(args)
  34
  35 def main():
  36     args = parse_args()
  37
  38     output_path = args.output_folder
  39     article_filename = args.article_file
  40     #handle -d
  41     if args.query_date:
  42         query_date = args.query_date
  43     else:
  44         yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
  45         query_date = yesterday.strftime("%Y%m%d")
  46
  47     #handle -L
  48     loglevel_mapping = { 'debug' : logging.DEBUG,
  49                          'info' : logging.INFO,
  50                          'warning' : logging.WARNING,
  51                          'error' : logging.ERROR,
  52                          'critical' : logging.CRITICAL }
  53
  54     if args.logging_level in loglevel_mapping:
  55         loglevel = loglevel_mapping[args.logging_level]
  56     else:
  57         print("Choose a valid log level: debug, info, warning, error, or critical")
  58         exit
  59
  60     #handle -W
  61     if args.logging_destination:
  62         logging.basicConfig(filename=args.logging_destination, filemode='a', level=loglevel)
  63     else:
  64         logging.basicConfig(level=loglevel)
  65
  66     export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
  67     export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
  68     export_time = str(datetime.datetime.now())
  69
  70     logging.info(f"Starting run at {export_time}")
  71     logging.info(f"Last commit: {export_git_hash}")
  72
  73     json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
  74     tsv_output_filename =  os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
  75
  76     api_session = api.Session("https://en.wikipedia.org/w/api.php")
  77
  78     # list of properties from the API we want to gather (basically all of
  79     # them supported by mediawik-utilities)
  80
  81     rv_props =  {'revid' : 'ids',
  82                  'timestamp' : 'timestamp',
  83                  'user' : 'user',
  84                  'userid' : 'userid',
  85                  'size' : 'size',
  86                  'sha1' : 'sha1',
  87                  'contentmodel' : 'contentmodel',
  88                  'tags' : 'tags',
  89                  'comment' : 'comment',
  90                  'content' : 'content' }
  91
  92     exclude_from_tsv = ['tags', 'comment', 'content']
  93
  94     # load the list of articles
  95     with open(article_filename, 'r') as infile:
  96         article_list = [art.strip() for art in list(infile)]
  97
  98     def get_revisions_for_page(title):
  99         return api_session.revisions.query(properties=rv_props.values(),
 100                                            titles={title},
 101                                            direction="newer")
 102
 103     tsv_fields = ['title', 'pageid', 'namespace']
 104     tsv_fields = tsv_fields + list(rv_props.keys())
 105
 106     # drop fields that we identified for exclusion
 107     tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]
 108
 109     # add special export fields
 110     tsv_fields = tsv_fields + ['url', 'export_timestamp', 'export_commit']
 111
 112     export_info = { 'git_commit' : export_git_hash,
 113                     'timestamp' : export_time }
 114
 115     with open(json_output_filename, 'w') as json_output, \
 116          open(tsv_output_filename, 'w') as tsv_output:
 117
 118         tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields, delimiter="\t")
 119         tsv_writer.writeheader()
 120
 121         for article in article_list:
 122             logging.info(f"pulling revisiosn for: {article}")
 123             for rev in get_revisions_for_page(article):
 124                 logging.debug(f"processing raw revision: {rev}")
 125
 126                 # add export metadata
 127                 rev['exported'] = export_info
 128
 129                 # save the json version of the code
 130                 print(json.dumps(rev), file=json_output)
 131
 132                 # handle missing data
 133                 if "sha1" not in rev:
 134                     rev["sha1"] = ""
 135
 136                 # add page title information
 137                 rev['title'] = rev['page']['title']
 138                 rev['pageid'] = rev['page']['pageid']
 139                 rev['namespace'] = rev['page']['ns']
 140
 141                 # construct a URL
 142                 rev['url'] = Request('GET', 'https://en.wikipedia.org/w/index.php',
 143                                      params={'title' : rev['title'].replace(" ", "_"),
 144                                             'oldid' : rev['revid']}).prepare().url
 145
 146                 rev['export_timestamp'] = export_time
 147                 rev['export_commit'] = export_git_short_hash
 148
 149                 tsv_writer.writerow({k: rev[k] for k in tsv_fields})
 150             break
 151
 152 if __name__ == "__main__":
 153
 154     main()