X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/34f8b9a23ee8dba8300d610c3aa88e86f7c5347b:/wikipedia_views/scripts/wikiproject_scraper.py..ff96d52cb92966dd0c487e6452aeab70773cf3e6:/wikipedia/scripts/static/gitweb.css

diff --git a/wikipedia_views/scripts/wikiproject_scraper.py b/wikipedia_views/scripts/wikiproject_scraper.py
deleted file mode 100755
index 9d209a9..0000000
--- a/wikipedia_views/scripts/wikiproject_scraper.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python3
-
-###############################################################################
-#
-# This script scrapes the COVID-19 Wikiproject
-#
-# It (1) hits the fcgi once to find out how many rounds are needed, (2) hits
-# the fcgi that many times, boiling each response down to just a list of
-# article names, and (3) saves that list out.
-#
-# At time of writing:
-# the fcgi returns at most 1000 rows no matter what you put in the limit.
-# Page 1 looks like this:
-# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality
-#
-# and page 2 looks like this:
-# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?namespace=&run=yes&projecta=COVID-19&score=&sorta=Importance&importance=&limit=1000&pagename=&quality=&sortb=Quality&&offset=1001
-#
-###############################################################################
-
-import argparse
-import requests
-import datetime
-import logging
-import re
-import math
-from bs4 import BeautifulSoup
-
-def parse_args():
-
-    parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 Wikiproject.')
-    parser.add_argument('-o', '--output_folder', help='Where to save output', default="../resources/", type=str)
-    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info')
-    parser.add_argument('-W', '--logging_destination', help='Logging destination.', default='../logs/')
-    args = parser.parse_args()
-
-    return(args)
-
-
-def main():
-
-    args = parse_args()
-
-    outputPath = args.output_folder
-
-    #handle -W
-    today = datetime.datetime.today().strftime('%Y%m%d')
-    dest = args.logging_destination
-    logHome = f"{dest}scraping{today}"
-
-    #handle -L
-    loglevel = args.logging_level
-    if loglevel == 'debug':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.DEBUG)
-    elif loglevel == 'info':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.INFO)
-    elif loglevel == 'warning':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.WARNING)
-    elif loglevel == 'error':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.ERROR)
-    elif loglevel == 'critical':
-        logging.basicConfig(filename=logHome, filemode='a', level=logging.CRITICAL)
-    else:
-        print("Choose a valid log level: debug, info, warning, error, or critical")
-        exit(1)
-
-
-    outputFile = f"{outputPath}articles.txt"
-    logging.debug(f"Starting scrape at {datetime.datetime.now()} and destructively outputting article list to {outputFile}.")
-    #1 How many hits to the fcgi?
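-    # (the fcgi reports "Total results:" in its navbox div; dividing that count
-    #  by the 1000-row cap and rounding up gives the number of requests to make)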
-
-    #make a session
-    session = requests.Session()
-
-    originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"
-    headURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset="
-    tailURL = "&sorta=Importance&sortb=Quality" #head + offset + tail = original when offset = 1
-
-    # find out how many results we have
-    response = session.get(originalURL)
-
-    soup = BeautifulSoup(response.text, features="html.parser")
-    nodes = soup.find_all('div', class_="navbox")
-    rx = re.compile(r"Total results:\D*(\d+)")
-    m = rx.search(nodes[0].get_text())
-    #print(nodes[0].get_text())
-    numResults = int(m.group(1))
-
-    logging.debug(f"fcgi returned {numResults}")
-    rounds = math.ceil(numResults/1000)
-
-    #2 Fetch and parse down to just the article names
-    articleNames = []
-
-    for i in range(1, rounds+1):
-        offset = (i - 1)*1000 + 1 #offset is 1, then 1001, then 2001
-        url = f"{headURL}{offset}{tailURL}"
-        response = session.get(url)
-        soup = BeautifulSoup(response.text, features="html.parser") #make fresh soup
-        article_rows = soup.find_all('tr', class_="list-odd") #just the odd rows first
-        for row in article_rows:
-            a = row.find('a')
-            articleNames.append(a.get_text())
-        article_rows = soup.find_all('tr', class_="list-even") #now the even rows
-        for row in article_rows:
-            a = row.find('a')
-            articleNames.append(a.get_text())
-
-    #3 Save the list to a file
-
-    with open(outputFile, 'w') as f:
-        f.write('\n'.join(articleNames)+'\n')
-    logging.debug(f"Finished scrape and made a new article file at {datetime.datetime.now()}")
-
-
-if __name__ == "__main__":
-
-    main()
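
For context, below the diff: a minimal standalone sketch of the paging arithmetic the deleted script relied on. The enwp10 list2.fcgi endpoint caps each response at 1000 rows (per the file's own header comment), so the script walked offsets 1, 1001, 2001, ... until the reported total was covered. This is an illustration of that arithmetic only, not a tested client for the endpoint.

import math

RESULTS_PER_PAGE = 1000  # hard cap noted in the script's header comment

def page_offsets(total_results, per_page=RESULTS_PER_PAGE):
    """Yield the offset parameter for each request: 1, 1001, 2001, ..."""
    rounds = math.ceil(total_results / per_page)
    for i in range(1, rounds + 1):
        yield (i - 1) * per_page + 1

# e.g. 2,347 tracked pages -> three requests, at offsets [1, 1001, 2001]
print(list(page_offsets(2347)))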