code.communitydata.science - rises_declines_wikia_code.git/blob - mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/element_iterator.py
Initial commit
[rises_declines_wikia_code.git] / mediawiki_dump_tools / Mediawiki-Utilities / mw / xml_dump / element_iterator.py
1
2 try:
3         import xml.etree.cElementTree as etree
4 except ImportError:
5         import xml.etree.ElementTree as etree
6
7 from xml.etree.ElementTree import ParseError
8
9 from .errors import MalformedXML
10
11
def trim_ns(tag):
    """
    Strip a leading ``{namespace}`` qualifier from an element tag.

    Everything up to and including the first ``}`` is discarded.  A tag
    that contains no ``}`` is returned unchanged.
    """
    _, closing_brace, local_name = tag.partition("}")
    # ``partition`` yields an empty separator when no "}" was found; in that
    # case the whole tag is already the local name.
    return local_name if closing_brace else tag
14
15
class EventPointer:
    """
    Wrap a stream of ``(event, element)`` pairs and track the open tags.

    ``etree_events`` is consumed with ``next()``.  Each ``"start"`` event
    pushes the element's namespace-trimmed tag onto ``tag_stack``; any other
    event is treated as the close of an element and must match the tag on
    top of the stack.
    """

    def __init__(self, etree_events):
        # Namespace-trimmed tags of the elements that are currently open,
        # outermost first.
        self.tag_stack = []
        self.etree_events = etree_events

    def __iter__(self):
        # ``__next__`` already does the work; returning ``self`` completes
        # the iterator protocol so the pointer can be used in ``for`` loops.
        return self

    def __next__(self):
        """
        Read the next event from the stream and update ``tag_stack``.

        Returns the ``(event, element)`` pair that was read.

        Raises ``MalformedXML`` when a closing tag does not match the
        innermost open tag, or when a closing tag arrives while no tag is
        open.  ``StopIteration`` from the underlying stream is propagated.
        """
        event, element = next(self.etree_events)

        tag = trim_ns(element.tag)

        if event == "start":
            self.tag_stack.append(tag)
        elif not self.tag_stack:
            # Without this guard, ``self.tag_stack[-1]`` would surface as an
            # unhelpful IndexError instead of a malformed-document error.
            raise MalformedXML(
                "Saw closing tag {0}, but no element is open.".format(tag)
            )
        elif self.tag_stack[-1] == tag:
            self.tag_stack.pop()
        else:
            raise MalformedXML("Expected {0}, but saw {1}.".format(
                self.tag_stack[-1],
                tag)
            )

        return event, element

    def depth(self):
        """Return the number of elements that are currently open."""
        return len(self.tag_stack)

    @classmethod
    def from_file(cls, f):
        """
        Build a pointer over the start/end events parsed from ``f``.

        ``f`` is handed directly to ``etree.iterparse``.  The instance is
        created through ``cls`` so that subclasses get an instance of their
        own type.
        """
        return cls(etree.iterparse(f, events=("start", "end")))
45
46
class ElementIterator:
    """
    Streaming view over one element and the events nested inside it.

    The iterator shares ``pointer`` with its parent and children.  It is
    finished (``done``) once the pointer's depth has dropped back to
    ``self.depth``, i.e. once this element's closing event has been read.
    """

    def __init__(self, element, pointer):
        self.pointer = pointer
        self.element = element
        # Both construction sites in this class build the iterator right
        # after reading this element's event from the pointer, so the
        # element is closed once the depth falls to one below the current
        # value.
        self.depth = pointer.depth() - 1

        self.done = False

    def __iter__(self):
        """
        Yield an ``ElementIterator`` for each child element as it starts.

        When the generator is resumed, the child that was just yielded is
        cleared, which also consumes any of its events left unread.
        """
        while not self.done and self.pointer.depth() > self.depth:
            event, element = next(self.pointer)

            if event == "start":
                sub_iterator = ElementIterator(element, self.pointer)

                yield sub_iterator

                sub_iterator.clear()

        self.done = True

    def complete(self):
        """
        Consume the remaining events of this element.

        Elements seen while the pointer is still inside this element are
        cleared as they go by; this element itself is left intact.
        """
        while not self.done and self.pointer.depth() > self.depth:
            event, element = next(self.pointer)
            # The event that closes this element drops the depth to
            # ``self.depth``; skipping the clear then preserves
            # ``self.element`` for callers such as ``text``.
            if self.pointer.depth() > self.depth:
                element.clear()

        self.done = True

    def clear(self):
        """Consume the rest of this element, then clear the element itself."""
        self.complete()
        self.element.clear()

    def attr(self, key, alt=None):
        """Return the element's attribute ``key``, or ``alt`` if absent."""
        return self.element.attrib.get(key, alt)

    def __getattr__(self, attr):
        """
        Provide the computed attributes ``tag`` and ``text``.

        ``tag`` is the element's tag with any namespace trimmed.  ``text``
        first consumes the rest of the element and then returns the
        element's text.  Any other name raises ``AttributeError``.
        """
        if attr == "tag":
            return trim_ns(self.element.tag)
        elif attr == "text":
            self.complete()
            return self.element.text
        else:
            raise AttributeError("%s has no attribute %r" % (self.__class__.__name__, attr))

    @staticmethod
    def _read_context(f, size=500):
        """
        Read up to ``size`` characters from ``f`` for use in an error message.

        Works for binary and text file objects alike; returns an empty
        string when ``f`` has no ``read`` method (e.g. it is a filename).
        """
        read = getattr(f, "read", None)
        if not callable(read):
            return ""

        snippet = read(size)
        if isinstance(snippet, (bytes, bytearray)):
            # Undecodable bytes are replaced rather than allowed to raise
            # while an error is already being reported.
            snippet = str(snippet, 'utf-8', 'replace')
        return snippet

    @classmethod
    def from_file(cls, f):
        """
        Build an iterator over the first element parsed from ``f``.

        Raises ``ParseError`` if the first event cannot be parsed.  The
        message is extended with up to 500 characters read from ``f`` at
        its current position, and the ``code`` and ``position`` attributes
        of the original error are carried over when present.
        """
        try:
            pointer = EventPointer.from_file(f)
            event, element = next(pointer)
            return cls(element, pointer)
        except ParseError as e:
            error = ParseError(
                "{0}: {1}...".format(str(e), cls._read_context(f)))
            # A freshly constructed ParseError lacks the parser's error
            # details; copy them so callers can still inspect them.
            for name in ("code", "position"):
                if hasattr(e, name):
                    setattr(error, name, getattr(e, name))
            raise error from e

Community Data Science Collective || Want to submit a patch?