]> code.communitydata.science - rises_declines_wikia_code.git/blob - mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_iterator.py
Initial commit
[rises_declines_wikia_code.git] / mediawiki_dump_tools / Mediawiki-Utilities / mw / xml_dump / iteration / tests / test_iterator.py
1 import io
2
3 from nose.tools import eq_, assert_is_instance
4
5 from ....types import Timestamp
6 from ..iterator import Iterator
7 from ..comment import Comment
8 from ..text import Text
9 from ..revision import Revision
10 from ..page import Page
11
12
# Two-page MediaWiki export (schema 0.8) shared by the tests below:
#   * page "Foo" (ns 0, id 1) holds revisions 1 and 2;
#   * page "Bar" (ns 1, id 2) is a redirect with restrictions and holds
#     revisions 3 and 4.
# Revision 4 has no <contributor> element and an empty <text/> element.
#
# The xsi:schemaLocation value is kept on a single line: it is a
# whitespace-separated "namespace location" pair, so wrapping it in the
# middle of a URL would turn the first entry into "http ://..." once the
# parser normalizes the newline to a space.
SAMPLE_XML = """
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
           version="0.8" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <base>http://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.22wmf2</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Foo</title>
    <ns>0</ns>
    <id>1</id>
    <revision beginningofpage="true">
      <id>1</id>
      <timestamp>2004-08-09T09:04:08Z</timestamp>
      <contributor>
        <username>Gen0cide</username>
        <id>92182</id>
      </contributor>
      <text xml:space="preserve" bytes="234" id="55">Revision 1 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
    <revision>
      <id>2</id>
      <timestamp>2004-08-10T09:04:08Z</timestamp>
      <contributor>
        <ip>222.152.210.109</ip>
      </contributor>
      <text xml:space="preserve" bytes="235" id="56">Revision 2 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <comment>Comment 2</comment>
      <format>text/x-wiki</format>
    </revision>
  </page>
  <page>
    <title>Bar</title>
    <ns>1</ns>
    <id>2</id>
    <redirect title="Computer accessibility" />
    <restrictions>edit=sysop:move=sysop</restrictions>
    <revision beginningofpage="true">
      <id>3</id>
      <timestamp>2004-08-11T09:04:08Z</timestamp>
      <contributor>
        <ip>222.152.210.22</ip>
      </contributor>
      <text xml:space="preserve" bytes="236" id="57">Revision 3 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
    <revision>
      <id>4</id>
      <timestamp>2004-08-12T09:04:08Z</timestamp>
      <text id="58" bytes="237" />
      <sha1>6ixvq7o1yg75n9g9chqqg94myzq11c5</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
  </page>
</mediawiki>"""
82
83
def test_complete():
    """Read SAMPLE_XML front to back, checking every page and revision.

    Pages are pulled from the dump with ``next(dump)`` and revisions from
    each page with ``next(page)``; the expected values mirror the fixture.
    """
    xml_stream = io.StringIO(SAMPLE_XML)
    dump = Iterator.from_file(xml_stream)

    # <siteinfo> declares namespace keys 0 and 1.
    eq_([0, 1], [ns.id for ns in dump.namespaces])

    # --- Page "Foo": no <redirect>, no <restrictions> -------------------
    wiki_page = next(dump)
    eq_(wiki_page.title, "Foo")
    eq_(wiki_page.namespace, 0)
    eq_(wiki_page.id, 1)
    eq_(wiki_page.redirect, None)
    eq_(wiki_page.restrictions, [])

    # Revision 1: registered contributor, flagged beginningofpage="true".
    rev = next(wiki_page)
    eq_(rev.id, 1)
    eq_(rev.timestamp, Timestamp("2004-08-09T09:04:08Z"))
    eq_(rev.contributor.id, 92182)
    eq_(rev.contributor.user_text, "Gen0cide")
    assert_is_instance(rev.text, Text)
    eq_(rev.text, "Revision 1 text")
    eq_(rev.text.bytes, 234)
    eq_(rev.text.id, 55)
    # Text content is compared a second time after reading its attributes.
    eq_(rev.text, "Revision 1 text")
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    eq_(rev.comment, None)
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")
    eq_(rev.beginningofpage, True)

    # Revision 2: contributor given as <ip>, carries a <comment>.
    rev = next(wiki_page)
    eq_(rev.id, 2)
    eq_(rev.timestamp, Timestamp("2004-08-10T09:04:08Z"))
    eq_(rev.contributor.id, None)
    eq_(rev.contributor.user_text, "222.152.210.109")
    eq_(rev.text, "Revision 2 text")
    eq_(rev.text.bytes, 235)
    eq_(rev.text.id, 56)
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    assert_is_instance(rev.comment, Comment)
    eq_(rev.comment, "Comment 2")
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")
    eq_(rev.beginningofpage, False)

    # --- Page "Bar": has both <redirect> and <restrictions> -------------
    wiki_page = next(dump)
    assert_is_instance(wiki_page, Page)
    eq_(wiki_page.title, "Bar")
    eq_(wiki_page.namespace, 1)
    eq_(wiki_page.id, 2)
    eq_(wiki_page.redirect.title, "Computer accessibility")
    eq_(wiki_page.restrictions, ["edit=sysop:move=sysop"])

    # Revision 3: contributor given as <ip>, no <comment>.
    rev = next(wiki_page)
    assert_is_instance(rev, Revision)
    eq_(rev.id, 3)
    eq_(rev.timestamp, Timestamp("2004-08-11T09:04:08Z"))
    eq_(rev.contributor.id, None)
    eq_(rev.contributor.user_text, "222.152.210.22")
    assert_is_instance(rev.text, Text)
    eq_(rev.text.bytes, 236)
    eq_(rev.text.id, 57)
    eq_(rev.text, "Revision 3 text")
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    eq_(rev.comment, None)
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")
    assert_is_instance(str(wiki_page), str)

    # Revision 4: no <contributor> element and a self-closing <text/>.
    rev = next(wiki_page)
    assert_is_instance(rev, Revision)
    eq_(rev.id, 4)
    eq_(rev.timestamp, Timestamp("2004-08-12T09:04:08Z"))
    eq_(rev.contributor, None)
    assert_is_instance(rev.text, Text)
    eq_(rev.text.bytes, 237)
    eq_(rev.text.id, 58)
    eq_(rev.text, "")
    eq_(rev.sha1, "6ixvq7o1yg75n9g9chqqg94myzq11c5")
    eq_(rev.comment, None)
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")
    assert_is_instance(str(rev), str)
166
167
def test_skipping():
    """Advance to the second page without reading the first page's revisions.

    ``next(dump)`` is called twice in a row, so page "Foo" is never
    iterated; the first revision read from page "Bar" must still be
    revision 3.
    """
    xml_stream = io.StringIO(SAMPLE_XML)
    dump = Iterator.from_file(xml_stream)

    # First page: only its header fields are inspected.
    wiki_page = next(dump)
    eq_(wiki_page.title, "Foo")
    eq_(wiki_page.namespace, 0)
    eq_(wiki_page.id, 1)

    # Second page, requested while the first still has unread revisions.
    wiki_page = next(dump)
    eq_(wiki_page.title, "Bar")
    eq_(wiki_page.namespace, 1)
    eq_(wiki_page.id, 2)

    # The first revision yielded belongs to "Bar", not to the skipped page.
    rev = next(wiki_page)
    eq_(rev.id, 3)
    eq_(rev.timestamp, Timestamp("2004-08-11T09:04:08Z"))
    eq_(rev.contributor.id, None)
    eq_(rev.contributor.user_text, "222.152.210.22")
    assert_is_instance(rev.text, Text)
    eq_(rev.text, "Revision 3 text")
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    eq_(rev.comment, None)
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")
194
195
def test_serialization():
    """Check that serialize() followed by deserialize() yields an equal Iterator."""
    xml_stream = io.StringIO(SAMPLE_XML)
    dump = Iterator.from_file(xml_stream)

    # eq_ compares the original iterator with its round-tripped copy.
    eq_(dump, Iterator.deserialize(dump.serialize()))
202
def test_from_page_xml():
    """Build an Iterator from a bare <page> fragment and read both revisions.

    The fragment has no <mediawiki>/<siteinfo> wrapper, and its <text>
    elements carry no ``bytes``/``id`` attributes.
    """
    page_xml = """
    <page>
      <title>Foo</title>
      <ns>0</ns>
      <id>1</id>
      <revision>
        <id>1</id>
        <timestamp>2004-08-09T09:04:08Z</timestamp>
        <contributor>
          <username>Gen0cide</username>
          <id>92182</id>
        </contributor>
        <text xml:space="preserve">Revision 1 text</text>
        <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
        <model>wikitext</model>
        <format>text/x-wiki</format>
      </revision>
      <revision>
        <id>2</id>
        <timestamp>2004-08-10T09:04:08Z</timestamp>
        <contributor>
          <ip>222.152.210.109</ip>
        </contributor>
        <text xml:space="preserve">Revision 2 text</text>
        <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
        <model>wikitext</model>
        <comment>Comment 2</comment>
        <format>text/x-wiki</format>
      </revision>
    </page>
    """
    fragment_stream = io.StringIO(page_xml)
    dump = Iterator.from_page_xml(fragment_stream)

    # The attribute exists even without <siteinfo>; it is just an empty list.
    eq_(dump.namespaces, [])

    wiki_page = next(dump)
    eq_(wiki_page.title, "Foo")
    eq_(wiki_page.namespace, 0)
    eq_(wiki_page.id, 1)

    # Revision 1: registered contributor, no <comment>.
    rev = next(wiki_page)
    eq_(rev.id, 1)
    eq_(rev.timestamp, Timestamp("2004-08-09T09:04:08Z"))
    eq_(rev.contributor.id, 92182)
    eq_(rev.contributor.user_text, "Gen0cide")
    eq_(rev.text, "Revision 1 text")
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    eq_(rev.comment, None)
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")

    # Revision 2: contributor given as <ip>, carries a <comment>.
    rev = next(wiki_page)
    eq_(rev.id, 2)
    eq_(rev.timestamp, Timestamp("2004-08-10T09:04:08Z"))
    eq_(rev.contributor.id, None)
    eq_(rev.contributor.user_text, "222.152.210.109")
    eq_(rev.text, "Revision 2 text")
    eq_(rev.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
    eq_(rev.comment, "Comment 2")
    eq_(rev.model, "wikitext")
    eq_(rev.format, "text/x-wiki")

Community Data Science Collective || Want to submit a patch?