mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/page.py

   1 from ...types import serializable
   2 from ...util import none_or
   3 from ..errors import MalformedXML
   4 from .redirect import Redirect
   5 from .revision import Revision
   6
   7
   8 class Page(serializable.Type):
   9     """
  10     Page meta data and a :class:`~mw.xml_dump.Revision` iterator.  Instances of
  11     this class can be called as iterators directly.  E.g.
  12
  13     .. code-block:: python
  14
  15         page = mw.xml_dump.Page( ... )
  16
  17         for revision in page:
  18             print("{0} {1}".format(revision.id, page_id))
  19
  20     """
  21     __slots__ = (
  22         'id',
  23         'title',
  24         'namespace',
  25         'redirect',
  26         'restrictions'
  27     )
  28
  29     def __init__(self, id, title, namespace, redirect, restrictions, revisions=None):
  30         self.id = none_or(id, int)
  31         """
  32         Page ID : `int`
  33         """
  34
  35         self.title = none_or(title, str)
  36         """
  37         Page title (namespace excluded) : `str`
  38         """
  39
  40         self.namespace = none_or(namespace, int)
  41         """
  42         Namespace ID : `int`
  43         """
  44
  45         self.redirect = none_or(redirect, Redirect)
  46         """
  47         Page is currently redirect? : :class:`~mw.xml_dump.Redirect` | `None`
  48         """
  49
  50         self.restrictions = serializable.List.deserialize(restrictions)
  51         """
  52         A list of page editing restrictions (empty unless restrictions are specified) : list( `str` )
  53         """
  54
  55         # Should be a lazy generator
  56         self.__revisions = revisions or []
  57
  58     def __iter__(self):
  59         return self.__revisions
  60
  61     def __next__(self):
  62         return next(self.__revisions)
  63
  64     @classmethod
  65     def load_revisions(cls, first_revision, element):
  66         yield Revision.from_element(first_revision)
  67
  68         for sub_element in element:
  69             tag = sub_element.tag
  70
  71             if tag == "revision":
  72                 yield Revision.from_element(sub_element)
  73             else:
  74                 raise MalformedXML("Expected to see 'revision'.  " +
  75                                    "Instead saw '{0}'".format(tag))
  76
  77     @classmethod
  78     def from_element(cls, element):
  79         title = None
  80         namespace = None
  81         id = None
  82         redirect = None
  83         restrictions = []
  84
  85         first_revision = None
  86
  87         # Consume each of the elements until we see <id> which should come last.
  88         for sub_element in element:
  89             tag = sub_element.tag
  90             if tag == "title":
  91                 title = sub_element.text
  92             elif tag == "ns":
  93                 namespace = sub_element.text
  94             elif tag == "id":
  95                 id = int(sub_element.text)
  96             elif tag == "redirect":
  97                 redirect = Redirect.from_element(sub_element)
  98             elif tag == "restrictions":
  99                 restrictions.append(sub_element.text)
 100             elif tag == "DiscussionThreading":
 101                 continue
 102             elif tag == "sha1":
 103                 continue
 104             elif tag == "revision":
 105                 first_revision = sub_element
 106                 break
 107             # Assuming that the first revision seen marks the end of page
 108             # metadata.  I'm not too keen on this assumption, so I'm leaving
 109             # this long comment to warn whoever ends up maintaining this.
 110             else:
 111                 raise MalformedXML("Unexpected tag found when processing " +
 112                                    "a <page>: '{0}'".format(tag))
 113
 114         # Assuming that I got here by seeing a <revision> tag.  See verbose
 115         # comment above.
 116         revisions = cls.load_revisions(first_revision, element)
 117
 118         return cls(id, title, namespace, redirect, restrictions, revisions)