mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/revisions.py

   1 import logging
   2
   3 from ...util import none_or
   4 from ..errors import MalformedResponse
   5 from .collection import Collection
   6
   7 logger = logging.getLogger("mw.api.collections.revisions")
   8
   9
  10 class Revisions(Collection):
  11     """
  12     A collection of revisions indexes by title, page_id and user_text.
  13     Note that revisions of deleted pages are queriable via
  14     :class:`mw.api.DeletedRevs`.
  15     """
  16
  17     PROPERTIES = {'ids', 'flags', 'timestamp', 'user', 'userid', 'size',
  18                   'sha1', 'contentmodel', 'comment', 'parsedcomment',
  19                   'content', 'tags', 'flagged'}
  20
  21     DIFF_TO = {'prev', 'next', 'cur'}
  22
  23     # This is *not* the right way to do this, but it should work for all queries.
  24     MAX_REVISIONS = 50
  25
  26     def get(self, rev_id, **kwargs):
  27         """
  28         Get a single revision based on it's ID.  Throws a :py:class:`KeyError`
  29         if the rev_id cannot be found.
  30
  31         :Parameters:
  32             rev_id : int
  33                 Revision ID
  34             ``**kwargs``
  35                 Passed to :py:meth:`query`
  36
  37         :Returns:
  38             A single rev dict
  39         """
  40         rev_id = int(rev_id)
  41
  42         revs = list(self.query(revids={rev_id}, **kwargs))
  43
  44         if len(revs) < 1:
  45             raise KeyError(rev_id)
  46         else:
  47             return revs[0]
  48
  49     def query(self, *args, limit=None, **kwargs):
  50         """
  51         Get revision information.
  52         See `<https://www.mediawiki.org/wiki/API:Properties#revisions_.2F_rv>`_
  53
  54         :Parameters:
  55             properties : set(str)
  56                 Which properties to get for each revision:
  57
  58                 * ids            - The ID of the revision
  59                 * flags          - Revision flags (minor)
  60                 * timestamp      - The timestamp of the revision
  61                 * user           - User that made the revision
  62                 * userid         - User id of revision creator
  63                 * size           - Length (bytes) of the revision
  64                 * sha1           - SHA-1 (base 16) of the revision
  65                 * contentmodel   - Content model id
  66                 * comment        - Comment by the user for revision
  67                 * parsedcomment  - Parsed comment by the user for the revision
  68                 * content        - Text of the revision
  69                 * tags           - Tags for the revision
  70             limit : int
  71                 Limit how many revisions will be returned
  72                 No more than 500 (5000 for bots) allowed
  73             start_id : int
  74                 From which revision id to start enumeration (enum)
  75             end_id : int
  76                 Stop revision enumeration on this revid
  77             start : :class:`mw.Timestamp`
  78                 From which revision timestamp to start enumeration (enum)
  79             end : :class:`mw.Timestamp`
  80                 Enumerate up to this timestamp
  81             direction : str
  82                 "newer" or "older"
  83             user : str
  84                 Only include revisions made by user_text
  85             excludeuser : bool
  86                 Exclude revisions made by user
  87             tag : str
  88                 Only list revisions tagged with this tag
  89             expandtemplates : bool
  90                 Expand templates in revision content (requires "content" propery)
  91             generatexml : bool
  92                 Generate XML parse tree for revision content (requires "content" propery)
  93             parse : bool
  94                 Parse revision content (requires "content" propery)
  95             section : int
  96                 Only retrieve the content of this section number
  97             token : set(str)
  98                 Which tokens to obtain for each revision
  99
 100                 * rollback - See `<https://www.mediawiki.org/wiki/API:Edit_-_Rollback#Token>`_
 101             rvcontinue : str
 102                 When more results are available, use this to continue
 103             diffto : int
 104                 Revision ID to diff each revision to. Use "prev", "next" and
 105                 "cur" for the previous, next and current revision respectively
 106             difftotext : str
 107                 Text to diff each revision to. Only diffs a limited number of
 108                 revisions. Overrides diffto. If section is set, only that
 109                 section will be diffed against this text
 110             contentformat : str
 111                 Serialization format used for difftotext and expected for output of content
 112
 113                 * text/x-wiki
 114                 * text/javascript
 115                 * text/css
 116                 * text/plain
 117                 * application/json
 118
 119         :Returns:
 120             An iterator of rev dicts returned from the API.
 121         """
 122
 123         revisions_yielded = 0
 124         done = False
 125         while not done:
 126             if limit == None:
 127                 kwargs['limit'] = self.MAX_REVISIONS
 128             else:
 129                 kwargs['limit'] = min(limit - revisions_yielded, self.MAX_REVISIONS)
 130
 131             rev_docs, rvcontinue = self._query(*args, **kwargs)
 132
 133             for doc in rev_docs:
 134                 yield doc
 135                 revisions_yielded += 1
 136
 137                 if limit != None and revisions_yielded >= limit:
 138                     done = True
 139                     break
 140
 141             if rvcontinue != None and len(rev_docs) > 0:
 142                 kwargs['rvcontinue'] = rvcontinue
 143             else:
 144                 done = True
 145
 146
 147     def _query(self, revids=None, titles=None, pageids=None, properties=None,
 148                      limit=None, start_id=None, end_id=None, start=None,
 149                      end=None, direction=None, user=None, excludeuser=None,
 150                      tag=None, expandtemplates=None, generatexml=None,
 151                      parse=None, section=None, token=None, rvcontinue=None,
 152                      diffto=None, difftotext=None, contentformat=None):
 153
 154         params = {
 155             'action': "query",
 156             'prop': "revisions",
 157             'rawcontinue': ''
 158         }
 159
 160         params['revids'] = self._items(revids, type=int)
 161         params['titles'] = self._items(titles)
 162         params['pageids'] = self._items(pageids, type=int)
 163
 164         params['rvprop'] = self._items(properties, levels=self.PROPERTIES)
 165
 166         if revids == None: # Can't have a limit unless revids is none
 167             params['rvlimit'] = none_or(limit, int)
 168
 169         params['rvstartid'] = none_or(start_id, int)
 170         params['rvendid'] = none_or(end_id, int)
 171         params['rvstart'] = self._check_timestamp(start)
 172         params['rvend'] = self._check_timestamp(end)
 173
 174         params['rvdir'] = self._check_direction(direction)
 175         params['rvuser'] = none_or(user, str)
 176         params['rvexcludeuser'] = none_or(excludeuser, int)
 177         params['rvtag'] = none_or(tag, str)
 178         params['rvexpandtemplates'] = none_or(expandtemplates, bool)
 179         params['rvgeneratexml'] = none_or(generatexml, bool)
 180         params['rvparse'] = none_or(parse, bool)
 181         params['rvsection'] = none_or(section, int)
 182         params['rvtoken'] = none_or(token, str)
 183         params['rvcontinue'] = none_or(rvcontinue, str)
 184         params['rvdiffto'] = self._check_diffto(diffto)
 185         params['rvdifftotext'] = none_or(difftotext, str)
 186         params['rvcontentformat'] = none_or(contentformat, str)
 187
 188         doc = self.session.get(params)
 189
 190         try:
 191             if 'query-continue' in doc:
 192                 rvcontinue = doc['query-continue']['revisions']['rvcontinue']
 193             else:
 194                 rvcontinue = None
 195
 196             pages = doc['query'].get('pages', {}).values()
 197             rev_docs = []
 198
 199             for page_doc in pages:
 200                 if 'missing' in page_doc or 'revisions' not in page_doc: continue
 201
 202                 page_rev_docs = page_doc['revisions']
 203                 del page_doc['revisions']
 204
 205                 for rev_doc in page_rev_docs:
 206                     rev_doc['page'] = page_doc
 207
 208                 rev_docs.extend(page_rev_docs)
 209
 210             return rev_docs, rvcontinue
 211
 212         except KeyError as e:
 213             raise MalformedResponse(str(e), doc)
 214
 215
 216     def _check_diffto(self, diffto):
 217         if diffto == None or diffto in self.DIFF_TO:
 218             return diffto
 219         else:
 220             return int(diffto)