#!/usr/bin/env python3

###############################################################################
#
# This script scrapes the COVID-19 WikiProject.
#
# It (1) hits the fcgi once to find out how many rounds of requests are
# needed, then (2) hits the fcgi that many times, reducing each response to
# just a list of article names, and (3) saves that list out.
#
# At the time of writing, the fcgi returns at most 1000 results per request,
# no matter what you put in the limit. Page 1 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality
#
# and page 2 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?namespace=&run=yes&projecta=COVID-19&score=&sorta=Importance&importance=&limit=1000&pagename=&quality=&sortb=Quality&&offset=1001
#
###############################################################################

import argparse
import datetime
import logging
import math
import re
import subprocess

import requests
from bs4 import BeautifulSoup

import digobs

def parse_args():

    parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 WikiProject.')
    parser.add_argument('-o', '--output_file', help='Where to save output', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str)
    args = parser.parse_args()

    return(args)

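# Example invocation (illustrative; scraper.log is just a placeholder log file
# name, and the command assumes it is run from the repository root so that the
# default output path and the git rev-parse calls in main() resolve correctly):
#
#   python3 wikipedia/scripts/wikiproject_scraper.py -L debug -W scraper.log
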
def main():

    args = parse_args()
    outputFile = args.output_file

    # handle -W: log to the given file if one was provided, otherwise to standard error
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    export_time = str(datetime.datetime.now())

    logging.info(f"Starting at {export_time} and destructively outputting article list to {outputFile}.")
    logging.info(f"Last commit: {export_git_hash}")

    #1 How many hits to the fcgi do we need?
    session = requests.Session()

    originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"
    headURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset="
    tailURL = "&sorta=Importance&sortb=Quality" # head + offset + tail = originalURL when offset = 1

    # find out how many results we have
    response = session.get(originalURL)

    soup = BeautifulSoup(response.text, features="html.parser")
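    # The result summary is assumed (based on the selector and regex below) to sit
    # in a <div class="navbox"> whose text includes something like "Total results: 2534";
    # that count (the figure here is purely illustrative) drives the paging loop.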
    nodes = soup.find_all('div', class_="navbox")
    rx = re.compile(r"Total results:\D*(\d+)")
    m = rx.search(nodes[0].get_text())
    #print(nodes[0].get_text())
    numResults = int(m.group(1))

    logging.debug(f"fcgi returned {numResults}")
    rounds = math.ceil(numResults/1000)
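    # e.g. (hypothetical count) numResults = 2534 -> rounds = 3, and the loop below
    # requests offsets 1, 1001, and 2001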

    #2 Fetch and parse down to just the article names
    articleNames = []

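    # Result rows are assumed (from the selectors below) to carry the classes
    # "list-odd" and "list-even", each holding an <a> tag whose text is the article
    # title. Odd rows are collected before even rows, so the saved list does not
    # preserve the on-page ordering.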
    for i in range(1, rounds+1):
        offset = (i - 1)*1000 + 1 #offset is 1, then 1001, then 2001
        url = f"{headURL}{offset}{tailURL}"
        response = session.get(url)
        soup = BeautifulSoup(response.text, features="html.parser") #make fresh soup
        article_rows = soup.find_all('tr', class_="list-odd") #just the odds first
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())
        article_rows = soup.find_all('tr', class_="list-even") #now the evens
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())

    #3 Save the list to a file

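    # Output format: plain newline-delimited text, one article title per line.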
    with open(outputFile, 'w') as f:
        f.write('\n'.join(articleNames)+'\n')
    logging.debug(f"Finished scrape and made a new article file at {datetime.datetime.now()}")


if __name__ == "__main__":

    main()

