#!/usr/bin/env python3

###############################################################################
#
# This script scrapes the COVID-19 WikiProject article list.
#
# It (1) hits the fcgi once to find out how many rounds of requests are
# needed, then (2) hits the fcgi that many times, reducing each response to
# just a list of article names, and (3) saves the combined list to a file.
#
# At the time of writing, the fcgi returns at most 1000 results no matter
# what you put in the limit parameter. Page 1 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality
#
# and page 2 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?namespace=&run=yes&projecta=COVID-19&score=&sorta=Importance&importance=&limit=1000&pagename=&quality=&sortb=Quality&&offset=1001
#
###############################################################################
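
# Example invocation (using the flags defined in parse_args() below; the paths
# shown are the script defaults):
#   python3 wikiproject_scraper.py --output_folder ../resources/ --logging_level info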

import argparse
import requests
import datetime
import logging
import re
import math
import sys
from bs4 import BeautifulSoup

def parse_args():

    parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 WikiProject.')
    parser.add_argument('-o', '--output_folder', help='Where to save output', default="../resources/", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info')
    parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/')
    args = parser.parse_args()

    return args


def main():

    args = parse_args()

    outputPath = args.output_folder

    #handle -W
    today = datetime.datetime.today().strftime('%Y%m%d')
    dest = args.logging_destination
    logHome = f"{dest}scraping{today}"
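    # e.g. with the default -W this is something like ../logs/scraping20200401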

    #handle -L
    loglevel = args.logging_level
    if loglevel == 'debug':
        logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
    elif loglevel == 'info':
        logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
    elif loglevel == 'warning':
        logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
    elif loglevel == 'error':
        logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
    elif loglevel == 'critical':
        logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
    else:
        print("Choose a valid log level: debug, info, warning, error, or critical")
        sys.exit(1)


    outputFile = f"{outputPath}articles.txt"
    logging.debug(f"Starting scrape at {datetime.datetime.now()} and destructively outputting article list to {outputFile}.")
    #1 How many hits to the fcgi?

    #make a session

    session = requests.Session()

    originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"
    headURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset="
    tailURL = "&sorta=Importance&sortb=Quality" #head + offset + tail = original when offset = 1

    # find out how many results we have
    response = session.get(originalURL)

    soup = BeautifulSoup(response.text, features="html.parser")
    nodes = soup.find_all('div', class_="navbox")
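    # the first navbox div's text is expected to include a line like
    # "Total results: N", which tells us how many articles the project tracks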
    rx = re.compile(r"Total results:\D*(\d+)")
    m = rx.search(nodes[0].get_text())
    #print(nodes[0].get_text())
    numResults = int(m.group(1))

    logging.debug(f"fcgi returned {numResults}")
    rounds = math.ceil(numResults/1000)
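    # e.g. 2,500 total results means 3 rounds of up to 1,000 articles each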

    #2 Fetch and parse down to just the article names
    articleNames = []

    for i in range(1, rounds+1):
        offset = (i - 1)*1000 + 1 #offset is 1, then 1001, then 2001
        url = f"{headURL}{offset}{tailURL}"
        response = session.get(url)
        soup = BeautifulSoup(response.text, features="html.parser") #make fresh soup
        article_rows = soup.find_all('tr', class_="list-odd") #just the odd rows first
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())
        article_rows = soup.find_all('tr', class_="list-even") #now the even rows
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())

    #3 Save the list to a file

    with open(outputFile, 'w') as f:
        f.write('\n'.join(articleNames)+'\n')
    logging.debug(f"Finished scrape and made a new article file at {datetime.datetime.now()}")


if __name__ == "__main__":

    main()
