#!/usr/bin/env python3
from requests import Request
from datetime import datetime
import sys
import subprocess
import logging
import itertools
import requests
from functools import partial
from csv import DictWriter
import json
import mwapi as api

user_agent = "COVID-19 Digital Observatory, a Community Data Science Collective project. (https://github.com/CommunityDataScienceCollective/COVID-19_Digital_Observatory)"

def git_hash(short=False):
    if short:
        return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    else:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()

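# Illustrative usage (assumes the script runs from inside the git checkout):
#
#   git_hash(short=True)   # e.g. '1a2b3c4'
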
def init_logging(args):
    # configure the logging destination (the -W argument)
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a', level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_git_hash = git_hash()
    export_time = str(datetime.now())

    logging.info(f"Starting at {export_time}.")
    logging.info(f"Last commit: {export_git_hash}")
    return logging

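# Illustrative usage; argparse.Namespace stands in for parsed command-line
# arguments, and the attribute names follow how init_logging uses them:
#
#   from argparse import Namespace
#   args = Namespace(logging_destination=None, logging_level=logging.INFO)
#   log = init_logging(args)
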
def call_view_api(page, project, query_date):
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/all-access/all-agents/{page}/daily/{query_date}00/{query_date}00"
    response = requests.get(url)
    if response.ok:
        return response.json().get('items', [None])[0]
    else:
        logging.warning(f"Failure: {response.status_code} from {url}")
        logging.warning(response.json().get("detail", None))
        return None

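# Illustrative usage (assumes the article exists and the REST endpoint is
# reachable); the 'views' field is part of the pageviews API response:
#
#   views = call_view_api("Pandemic", "en.wikipedia", "20200401")
#   if views is not None:
#       print(views['views'])
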
# This function writes out the view data to a json file (j_outfile)
# and a tsv file (t_outfile), keeps track of failures,
# and returns the number of successes and failures
def process_view_responses(responses, j_outfile, t_outfile, logging=None):

    # Count leading failures (None responses) by hand: itertools.takewhile
    # would also consume and discard the first valid response.
    successes = 0
    failures = 0
    first_response = None
    for response in responses:
        if response is None:
            failures = failures + 1
        else:
            first_response = response
            break

    if first_response is None:
        if logging is not None:
            logging.error("No valid responses")
        sys.exit(1)

    successes = 1

    # the header and the field order come from the first valid response
    dw = DictWriter(t_outfile, sorted(first_response.keys()), delimiter='\t')
    dw.writeheader()
    json.dump(first_response, j_outfile)
    j_outfile.write('\n')  # newline-delimited JSON: one record per line
    dw.writerow(first_response)

    for response in responses:
        if response is None:
            failures = failures + 1
            continue
        else:
            successes = successes + 1

        if logging is not None:
            logging.debug(f"printing data: {response}")

        json.dump(response, j_outfile)
        j_outfile.write('\n')
        dw.writerow(response)

    return (successes, failures)

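# Illustrative usage (the file names are hypothetical): stream a generator of
# API responses through the writer and report the counts.
#
#   pages = ["Pandemic", "Coronavirus disease 2019"]
#   responses = (call_view_api(p, "en.wikipedia", "20200401") for p in pages)
#   with open("views.json", "w") as jf, open("views.tsv", "w") as tf:
#       successes, failures = process_view_responses(responses, jf, tf, logging)
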
def get_loglevel(arg_loglevel):
    loglevel_mapping = { 'debug' : logging.DEBUG,
                         'info' : logging.INFO,
                         'warning' : logging.WARNING,
                         'error' : logging.ERROR,
                         'critical' : logging.CRITICAL }

    if arg_loglevel in loglevel_mapping:
        loglevel = loglevel_mapping[arg_loglevel]
        return loglevel
    else:
        print("Choose a valid log level: debug, info, warning, error, or critical; defaulting to info.", file=sys.stderr)
        return logging.INFO

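# Illustrative usage:
#
#   get_loglevel('warning')   # -> logging.WARNING
#   get_loglevel('bogus')     # warns on stderr, returns logging.INFO
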
def get_revisions_for_page(title, api_session, logging, rv_props):

    result = api_session.get(action='query',
                             prop='revisions',
                             rvprop=rv_props.values(),
                             titles={title},
                             rvdir='newer',
                             rvslots='*',
                             continuation=True)
    return result

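# Illustrative usage; this rv_props mapping is an assumption based on the
# fields consumed in rev_batch_to_tsv below (the values are MediaWiki rvprop
# names, the keys are local labels):
#
#   rv_props = {'revid': 'ids', 'timestamp': 'timestamp', 'user': 'user',
#               'userid': 'userid', 'sha1': 'sha1',
#               'contentmodel': 'contentmodel', 'flags': 'flags'}
#   session = api.Session("https://en.wikipedia.org/w/api.php", user_agent=user_agent)
#   for batch in get_revisions_for_page("Pandemic", session, logging, rv_props):
#       print(batch['query']['pages'])
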
def get_pages_revisions(titles, project, logging, rv_props):
    logging.info(f"pulling revisions for: {project}")

    api_session = api.Session(f"https://{project}.org/w/api.php",
                              user_agent=user_agent)

    # lazily flatten the per-title continuation generators into one stream
    return itertools.chain.from_iterable(
        map(partial(get_revisions_for_page, api_session=api_session,
                    logging=logging, rv_props=rv_props),
            titles))

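# Illustrative usage, with the rv_props mapping sketched above:
#
#   batches = get_pages_revisions(["Pandemic", "Coronavirus disease 2019"],
#                                 "en.wikipedia", logging, rv_props)
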
def rev_batch_to_json(rev, export_info, json_output=None):
    rev['exported'] = export_info
    if json_output is None:
        return json.dumps(rev)
    else:
        json.dump(rev, json_output)

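# Illustrative usage; the export_info keys follow what rev_batch_to_tsv
# expects below:
#
#   export_info = {'export_timestamp': str(datetime.now()),
#                  'export_commit': git_hash(short=True)}
#   line = rev_batch_to_json({'revid': 12345}, export_info)
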
def rev_batch_to_tsv(batch, tsv_fields, export_info, project, tsv_writer=None):
    batch = batch.get('query', dict())
    pages = batch.get('pages', dict())
    for page in pages.values():
        pageid = page.get('pageid', None)
        ns = page.get('ns', None)
        title = page.get('title', '')
        logging.info(f"pulling revisions for: {title}")
        revs = page.get('revisions', [])
        for rev in revs:

            # handle missing data
            if "sha1" not in rev:
                rev["sha1"] = ""

            if "userhidden" in rev:
                rev["user"] = ""
                rev["userid"] = ""

            # recode anon so it's true or false instead of present/missing
            if "anon" in rev:
                rev["anon"] = True
            else:
                rev["anon"] = False

            # let's recode "minor" in the same way
            if "minor" in rev:
                rev["minor"] = True
            else:
                rev["minor"] = False

            # add page title information
            rev['title'] = title
            rev['pageid'] = pageid
            rev['namespace'] = ns
            rev['contentmodel'] = rev['slots']['main']['contentmodel']
            # construct a URL
            rev['url'] = Request('GET', f'https://{project}.org/w/index.php',
                                 params={'title' : rev['title'].replace(" ", "_"),
                                         'oldid' : rev['revid']}).prepare().url

            rev['export_timestamp'] = export_info['export_timestamp']
            rev['export_commit'] = export_info['export_commit']
            tsv_writer.writerow({k: rev[k] for k in tsv_fields})
