def get_revisions_for_page(title, api_session, logging, rv_props):
    """Request the revision history of a single page.

    Issues an ``action=query`` / ``prop=revisions`` call through
    ``api_session.get`` (oldest revisions first, all slots, with
    ``continuation=True``) and hands back whatever that call returns.

    Args:
        title: Title of the page to look up; sent as a one-element set.
        api_session: Object exposing ``get(**params)`` -- presumably an
            ``mwapi`` session; confirm against the caller.
        logging: Accepted for signature compatibility; not used here.
        rv_props: Mapping whose *values* are forwarded as ``rvprop``.

    Returns:
        The unmodified result of ``api_session.get``.
    """
    query_params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': rv_props.values(),
        'titles': {title},
        'rvdir': 'newer',
        'rvslots': '*',
        'continuation': True,
    }
    return api_session.get(**query_params)
+
def get_pages_revisions(titles, project, logging, rv_props):
    """Fetch revisions for several pages and chain the results together.

    A single API session is opened against
    ``https://{project}.org/w/api.php`` (using the module-level
    ``user_agent``) and shared by every per-page request.

    Args:
        titles: Iterable of page titles. It is consumed completely before
            this function returns, because the per-page results are
            unpacked into ``itertools.chain``.
        project: Host name without the ``.org`` suffix, interpolated into
            the API URL and the log message.
        logging: Logger-like object; ``info`` is called once here and the
            object is forwarded to ``get_revisions_for_page``.
        rv_props: Forwarded unchanged to ``get_revisions_for_page``.

    Returns:
        An ``itertools.chain`` over the per-page results, in the order of
        ``titles``.
    """
    logging.info(f"pulling revisions for: {project}")

    api_session = api.Session(f"https://{project}.org/w/api.php",
                              user_agent=user_agent)

    # Bind the arguments shared by every page so only the title varies.
    fetch_page = partial(get_revisions_for_page,
                         api_session=api_session,
                         logging=logging,
                         rv_props=rv_props)

    per_page_results = [fetch_page(page_title) for page_title in titles]
    return itertools.chain(*per_page_results)
+
def rev_batch_to_json(rev, export_info, json_output=None):
    """Serialise a revision batch as JSON, tagged with export metadata.

    ``rev`` is modified in place: ``export_info`` is stored under the
    ``'exported'`` key before serialisation.

    Args:
        rev: JSON-serialisable mapping to export.
        export_info: Value recorded under ``rev['exported']``.
        json_output: Optional writable text stream. When given, the JSON
            is written to it (no trailing newline is added).

    Returns:
        The JSON string when ``json_output`` is ``None``; otherwise
        ``None`` after writing to the stream.
    """
    rev['exported'] = export_info

    if json_output is not None:
        json.dump(rev, json_output)
        return None

    return json.dumps(rev)
+
def rev_batch_to_tsv(batch, tsv_fields, export_info, project, tsv_writer=None):
    """Flatten one API response batch into TSV rows, one row per revision.

    Each revision dict found under ``batch['query']['pages'][*]['revisions']``
    is normalised in place (missing values filled, flags recoded as
    booleans, page and export metadata attached, a permalink URL built)
    and then written through ``tsv_writer.writerow``.

    Args:
        batch: Mapping shaped like a ``prop=revisions`` query response.
            Missing ``'query'``, ``'pages'`` or ``'revisions'`` entries are
            treated as empty.
        tsv_fields: Iterable of keys selecting the columns of each row.
            A key that is absent from a revision is written as ``""``
            instead of raising ``KeyError``.
        export_info: Mapping providing ``'export_timestamp'`` and
            ``'export_commit'``.
        project: Host name without the ``.org`` suffix, used to build the
            revision URL.
        tsv_writer: Object with a ``writerow(dict)`` method (presumably a
            ``csv.DictWriter`` -- confirm against the caller). Despite the
            ``None`` default it must be supplied whenever the batch
            contains at least one revision.

    Returns:
        None. Rows are emitted through ``tsv_writer``.
    """
    pages = batch.get('query', dict()).get('pages', dict())

    # Only the page records are needed; the mapping keys are not used.
    for page in pages.values():
        pageid = page.get('pageid', None)
        ns = page.get('ns', None)
        title = page.get('title', '')
        logging.info(f"pulling revisions for: {title}")

        for rev in page.get('revisions', []):

            # handle missing data
            if "sha1" not in rev:
                rev["sha1"] = ""

            if "userhidden" in rev:
                rev["user"] = ""
                rev["userid"] = ""

            # recode anon and minor so they are True/False instead of
            # present/missing
            rev["anon"] = "anon" in rev
            rev["minor"] = "minor" in rev

            # add page title information
            rev['title'] = title
            rev['pageid'] = pageid
            rev['namespace'] = ns

            # Slot data may be absent from a revision; fall back to an
            # empty content model rather than aborting the whole export.
            main_slot = rev.get('slots', {}).get('main', {})
            rev['contentmodel'] = main_slot.get('contentmodel', '')

            # construct a URL
            rev['url'] = Request('GET', f'https://{project}.org/w/index.php',
                                 params={'title': rev['title'].replace(" ", "_"),
                                         'oldid': rev['revid']}).prepare().url

            rev['export_timestamp'] = export_info['export_timestamp']
            rev['export_commit'] = export_info['export_commit']

            # Fields the API omitted for this revision become empty cells,
            # consistent with the explicit back-filling above.
            tsv_writer.writerow({k: rev.get(k, "") for k in tsv_fields})