3 from ...types import serializable
4 from ...util import none_or
5 from ..element_iterator import ElementIterator
6 from ..errors import MalformedXML
7 from .namespace import Namespace
class ConcatinatingTextReader(io.TextIOBase):
    """
    Reads from a sequence of strings and/or text streams as though they
    were one concatenated text stream.  Plain strings are wrapped in
    :class:`io.StringIO`; other items are assumed to be readable text
    file-like objects.
    """

    def __init__(self, *items):
        # Wrap raw strings so that every item exposes the stream API.
        self.items = [io.StringIO(i) if isinstance(i, str) else i
                      for i in items]

    def read(self, size=-1):
        """
        Reads up to `size` characters.  Reads everything that remains when
        `size` is None or negative; returns "" when `size` is 0 (matching
        io.TextIOBase convention — previously read(0) drained everything).
        """
        return "".join(self._read(size))

    def readline(self):
        """
        Reads and returns the next line, or "" at end of stream.

        Exhausted sub-streams are skipped so that an item boundary no
        longer looks like EOF (the old behavior returned "" once per
        exhausted item, which stopped line iteration early).

        NOTE(review): a line that spans an item boundary is still returned
        in two pieces — confirm whether callers rely on line integrity.
        """
        while len(self.items) > 0:
            line = self.items[0].readline()
            if line != "":
                return line
            self.items.pop(0)  # Item exhausted; advance to the next one.
        return ""

    def _read(self, size):
        # Lazily generate chunks totaling at most `size` characters.
        if size is None or size < 0:
            # Drain everything that remains.
            for item in self.items:
                yield item.read()
        else:
            while len(self.items) > 0 and size > 0:
                byte_vals = self.items[0].read(size)
                yield byte_vals
                if len(byte_vals) < size:
                    size = size - len(byte_vals)  # Decrement chars still owed
                    self.items.pop(0)  # Short read: this item is exhausted
                else:
                    break
def concat(*stream_items):
    """
    Combines strings and/or text streams into a single readable text
    stream (see :class:`ConcatinatingTextReader`).
    """
    reader = ConcatinatingTextReader(*stream_items)
    return reader
class Iterator(serializable.Type):
    """
    XML Dump Iterator. Dump file meta data and a
    :class:`~mw.xml_dump.Page` iterator. Instances of this class can be
    called as an iterator directly. E.g.::

        from mw.xml_dump import Iterator

        # Construct dump file iterator
        dump = Iterator.from_file(open("example/dump.xml"))

        # Iterate through pages
        for page in dump:

            # Iterate through a page's revisions
            for revision in page:

                print(revision.id)
    """

    # NOTE(review): 'dbname' was missing from __slots__ even though
    # __init__ assigns self.dbname like its siblings; added so the
    # attribute is declared (and serialized, if serialization keys off
    # __slots__ — confirm against serializable.Type).
    __slots__ = ('site_name', 'dbname', 'base', 'generator', 'case',
                 'namespaces', '__pages')

    def __init__(self, site_name=None, dbname=None, base=None, generator=None,
                 case=None, namespaces=None, pages=None):

        # The name of the site. : str | `None` (if not specified in the XML)
        self.site_name = none_or(site_name, str)

        # The database name of the site. : str | `None` (if not specified
        # in the XML)
        self.dbname = none_or(dbname, str)

        # TODO: ??? : str | `None` (if not specified in the XML)
        self.base = none_or(base, str)

        # TODO: ??? : str | `None` (if not specified in the XML)
        self.generator = none_or(generator, str)

        # TODO: ??? : str | `None` (if not specified in the XML)
        self.case = none_or(case, str)

        # A list of :class:`mw.Namespace` | `None` (if not specified in
        # the XML)
        self.namespaces = none_or(namespaces, list)

        # Should be a lazy generator of page info
        self.__pages = pages

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
119 def load_namespaces(cls, element):
121 for sub_element in element:
122 tag = sub_element.tag
124 if tag == "namespace":
125 namespace = Namespace.from_element(sub_element)
126 namespaces.append(namespace)
128 assert False, "This should never happen"
133 def load_site_info(cls, element):
142 for sub_element in element:
143 if sub_element.tag == 'sitename':
144 site_name = sub_element.text
145 if sub_element.tag == 'dbname':
146 dbname = sub_element.text
147 elif sub_element.tag == 'base':
148 base = sub_element.text
149 elif sub_element.tag == 'generator':
150 generator = sub_element.text
151 elif sub_element.tag == 'case':
152 case = sub_element.text
153 elif sub_element.tag == 'namespaces':
154 namespaces = cls.load_namespaces(sub_element)
156 return site_name, dbname, base, generator, case, namespaces
159 def load_pages(cls, element):
161 for sub_element in element:
162 tag = sub_element.tag
165 yield Page.from_element(sub_element)
167 assert MalformedXML("Expected to see 'page'. " +
168 "Instead saw '{0}'".format(tag))
171 def from_element(cls, element):
180 for sub_element in element:
181 tag = sub_element.tag
182 if tag == "siteinfo":
183 site_name, dbname, base, generator, case, namespaces = \
184 cls.load_site_info(sub_element)
188 pages = cls.load_pages(element)
190 return cls(site_name, dbname, base, generator, case, namespaces, pages)
193 def from_file(cls, f):
194 element = ElementIterator.from_file(f)
195 assert element.tag == "mediawiki"
196 return cls.from_element(element)
199 def from_string(cls, string):
200 f = io.StringIO(string)
201 element = ElementIterator.from_file(f)
202 assert element.tag == "mediawiki"
203 return cls.from_element(element)
# Wraps a single <page> XML fragment in a minimal <mediawiki> envelope
# (export-0.5 schema header + closing tag) and parses the combined stream
# via from_file(concat(...)).
# NOTE(review): this span is elided/garbled in the paste — the multi-line
# header string literal is incomplete here.  Code is kept byte-identical
# rather than risk altering a runtime string; restore from upstream source
# before editing.
206 def from_page_xml(cls, page_xml):
208 <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/"
209 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
210 xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/
211 http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5"
219 footer = "</mediawiki>"
221 return cls.from_file(concat(header, page_xml, footer))