#!/usr/bin/env python3

###############################################################################
#
# This script scrapes the COVID-19 WikiProject.
#
# It (1) hits the fcgi once to find out how many rounds of requests are
# needed, then (2) hits the fcgi that many times, cooking the results down to
# just a list of article names, and (3) saves that list out to a file.
#
# At the time of writing, the fcgi returns at most 1000 results per request,
# no matter what you put in the limit parameter. Page 1 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality
#
# and page 2 looks like this:
# https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?namespace=&run=yes&projecta=COVID-19&score=&sorta=Importance&importance=&limit=1000&pagename=&quality=&sortb=Quality&&offset=1001
#
###############################################################################
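# A rough sketch of the pagination scheme described above (offsets beyond the
# first two pages are extrapolated from the URLs shown, not taken from any
# fcgi documentation):
#
#   offset=1    -> results    1-1000
#   offset=1001 -> results 1001-2000
#   offset=2001 -> results 2001-3000
#   ...and so on, in steps of 1000, until the total result count is exhausted.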

import argparse
import subprocess
import requests
import datetime
import logging
import re
import math
from bs4 import BeautifulSoup
import digobs

def parse_args():

    parser = argparse.ArgumentParser(description='Get a list of pages tracked by the COVID-19 Wikiproject.')
    parser.add_argument('-o', '--output_file', help='Where to save output', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
    parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=digobs.get_loglevel)
    parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str)
    args = parser.parse_args()

    return args

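# Example invocation (a hypothetical sketch; the flags and the output path come
# from the argparse definitions above, and the log file name is made up):
#
#   python3 wikipedia/scripts/wikiproject_scraper.py \
#       -o wikipedia/resources/enwp_wikiproject_covid19_articles.txt \
#       -L debug -W scraper.log
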
def main():

    args = parse_args()
    outputFile = args.output_file

    # handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    export_time = str(datetime.datetime.now())

    logging.info(f"Starting at {export_time} and destructively outputting article list to {outputFile}.")
    logging.info(f"Last commit: {export_git_hash}")

    #1 How many hits to the fcgi?
    session = requests.Session()

    originalURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset=1&sorta=Importance&sortb=Quality"
    headURL = "https://tools.wmflabs.org/enwp10/cgi-bin/list2.fcgi?run=yes&projecta=COVID-19&namespace=&pagename=&quality=&importance=&score=&limit=1000&offset="
    tailURL = "&sorta=Importance&sortb=Quality" # head + offset + tail = original when offset = 1
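    # For example (offsets other than 1 and 1001 are extrapolated from the
    # page-2 URL in the header comment): round 2 fetches
    #   headURL + "1001" + tailURL
    # which requests results 1001-2000 from the fcgi.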

    # find out how many results we have
    response = session.get(originalURL)

    soup = BeautifulSoup(response.text, features="html.parser")
    nodes = soup.find_all('div', class_="navbox")
    rx = re.compile(r"Total results:\D*(\d+)")
    m = rx.search(nodes[0].get_text())
    #print(nodes[0].get_text())
    numResults = int(m.group(1))

    logging.debug(f"fcgi returned {numResults}")
    rounds = math.ceil(numResults/1000)
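    # For example, if the fcgi reported (hypothetically) 3,547 total results,
    # this would be ceil(3547/1000) = 4 rounds of requests.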

    #2 Fetch and parse down to just the article names
    articleNames = []

    for i in range(1, rounds+1):
        offset = (i - 1)*1000 + 1 # offset is 1, then 1001, then 2001
        url = f"{headURL}{offset}{tailURL}"
        response = session.get(url)
        soup = BeautifulSoup(response.text, features="html.parser") # make fresh soup
        article_rows = soup.find_all('tr', class_="list-odd") # just the odd rows first
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())
        article_rows = soup.find_all('tr', class_="list-even") # now the even rows
        for row in article_rows:
            a = row.find('a')
            articleNames.append(a.get_text())

    #3 Save the list to a file

    with open(outputFile, 'w') as f:
        f.write('\n'.join(articleNames)+'\n')
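    # The resulting file holds one article title per line, e.g. (hypothetical
    # sample): "COVID-19 pandemic", then the next tracked title, and so on.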
    logging.debug(f"Finished scrape and made a new article file at {datetime.datetime.now()}")


if __name__ == "__main__":

    main()

