]> code.communitydata.science - rises_declines_wikia_code.git/blob - mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/7zfile.py
Initial commit
[rises_declines_wikia_code.git] / mediawiki_dump_tools / Mediawiki-Utilities / mw / xml_dump / 7zfile.py
1 """
2 This is a failed attempt at reading 7z archives with py7zlib; the code below is disabled by wrapping it in a string literal.  See
3 https://github.com/halfak/Mediawiki-Utilities/issues/13 for more details.
4 """
5
6 '''
7 import os
8
9 import py7zlib
10
11
class SevenZFileError(py7zlib.ArchiveError):
    """Error raised by SevenZFile, e.g. when an archive member is missing."""
14
class SevenZFile(object):
    """Thin read-only wrapper around a ``py7zlib.Archive7z``.

    The underlying file handle is kept open for the lifetime of the object
    (py7zlib is handed the handle, so it is assumed to still need it when
    members are read -- TODO confirm).  Call :meth:`close`, or use the
    instance as a context manager, to release it.

    Attributes:
        filepath: path the archive was opened from (used in error messages).
        archive: the ``py7zlib.Archive7z`` instance.
    """

    @classmethod
    def is_7zfile(cls, filepath):
        """Determine if filepath points to a valid 7z archive.

        Returns True when py7zlib can open the file and list its members,
        and False when py7zlib rejects it with an ``ArchiveError``.  Errors
        from ``open`` itself (missing file, permissions) still propagate.
        """
        with open(filepath, 'rb') as fp:
            try:
                # Listing the names forces py7zlib to parse the archive.
                py7zlib.Archive7z(fp).getnames()
            except py7zlib.ArchiveError:
                return False
        return True

    def __init__(self, filepath):
        """Open the archive at *filepath* for reading."""
        self.filepath = filepath
        # Keep a reference to the handle so close() can release it.
        self._fp = open(filepath, 'rb')
        try:
            self.archive = py7zlib.Archive7z(self._fp)
        except Exception:
            # Do not leak the handle when the archive cannot be parsed.
            self._fp.close()
            self._fp = None
            raise

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        fp = getattr(self, '_fp', None)
        if fp is not None:
            fp.close()
            self._fp = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __contains__(self, name):
        """Return True if *name* is one of the archive's member names."""
        return name in self.archive.getnames()

    def _read_member(self, name):
        """Return the full content of member *name*.

        Raises:
            SevenZFileError: if the archive has no member called *name*.
        """
        if name not in self:
            raise SevenZFileError('member %s not found in %s' %
                                  (name, self.filepath))
        return self.archive.getmember(name).read()

    def bytestream(self, name):
        """Iterate stream of bytes from an archive member.

        Yields length-1 slices of the member content (``bytes`` on
        Python 3).  Slicing, rather than iterating the data directly, keeps
        NUL bytes in the stream: iterating ``bytes`` yields ints, and a
        falsy ``0`` previously ended the stream early.

        Raises:
            SevenZFileError: on first iteration, if *name* is not a member.
        """
        data = self._read_member(name)
        for i in range(len(data)):
            yield data[i:i + 1]

    def readlines(self, name):
        """Iterate lines from an archive member.

        Lines are split on ``\\n`` (the last character of ``os.linesep`` on
        both POSIX and Windows) and keep their terminator; a trailing
        fragment without a newline is yielded as the final line.  Lines have
        the same type as the data returned by the member's ``read()``.

        Raises:
            SevenZFileError: on first iteration, if *name* is not a member.
        """
        data = self._read_member(name)
        newline = b'\n' if isinstance(data, bytes) else u'\n'
        start = 0
        total = len(data)
        while start < total:
            end = data.find(newline, start)
            if end < 0:
                # No more terminators: the remainder is the last line.
                yield data[start:]
                return
            yield data[start:end + 1]
            start = end + 1
59         
60     
61 import os
62
63 import py7zlib
64
# Ad-hoc experiment: print the raw content of the first member of a local
# simplewiki history dump.
with open("/mnt/data/xmldatadumps/public/simplewiki/20141122/simplewiki-20141122-pages-meta-history.xml.7z", "rb") as f:
    a = py7zlib.Archive7z(f)
    first_name = a.getnames()[0]
    first_member = a.getmember(first_name)
    print(first_member.read())
69 '''

Community Data Science Collective || Want to submit a patch?