code.communitydata.science - rises_declines_wikia_code.git/blob - mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokens.py
Initial commit
[rises_declines_wikia_code.git] / mediawiki_dump_tools / Mediawiki-Utilities / mw / lib / persistence / tokens.py
class Token:
    """
    A chunk of text paired with the revisions of a page in which that chunk
    has appeared.

    :Parameters:
        text
            The text of the token.
        revisions
            An optional list to use for the revision meta data.  When
            given, that same list object is stored and appended to by
            :meth:`persist`; when omitted (or ``None``), a new empty list
            is created for this token.
    """
    __slots__ = ('text', 'revisions')

    def __init__(self, text, revisions=None):
        if revisions is None:
            # Build a fresh list per token so instances never share state.
            revisions = []

        #: The text of the token.
        self.text = text

        #: The meta data for the revisions that the token has appeared
        #: within.
        self.revisions = revisions

    def persist(self, revision):
        """
        Records `revision` by appending it to :attr:`revisions`.
        """
        self.revisions.append(revision)

    def __repr__(self):
        return "{0}(text={1!r}, revisions={2!r})".format(
            self.__class__.__name__, self.text, self.revisions
        )
29
30
class Tokens(list):
    """
    Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with some
    useful helper functions.

    :Example:

        >>> from mw.lib.persistence import Token, Tokens
        >>>
        >>> tokens = Tokens()
        >>> tokens.append(Token("foo"))
        >>> tokens.extend([Token(" "), Token("bar")])
        >>>
        >>> tokens[0]
        Token(text='foo', revisions=[])
        >>>
        >>> "".join(tokens.texts())
        'foo bar'
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def persist(self, revision):
        """
        Calls ``persist(revision)`` on every token in this list.
        """
        for token in self:
            token.persist(revision)

    def texts(self):
        """
        Generates the ``text`` attribute of each token, in list order.
        """
        for token in self:
            yield token.text

    def compare(self, new, diff):
        """
        Diffs the texts of this list against `new` and applies the result.

        :Parameters:
            new
                The new token texts.  Must support slicing, since
                :meth:`apply_diff` slices it.
            diff
                A callable invoked as ``diff(old_texts, new)`` that returns
                the operations accepted by :meth:`apply_diff`.

        :Returns:
            The ``(tokens, tokens_added, tokens_removed)`` tuple produced by
            :meth:`apply_diff`.
        """
        # Materialize the texts: a bare generator can be consumed only once
        # and supports neither len() nor indexing, which sequence-based diff
        # implementations require.
        old = list(self.texts())

        return self.apply_diff(diff(old, new), self, new)

    @classmethod
    def apply_diff(cls, ops, old, new):
        """
        Builds the new token list described by a sequence of diff operations.

        :Parameters:
            ops
                An iterable of ``(code, a_start, a_end, b_start, b_end)``
                tuples, where `code` is one of ``"insert"``, ``"replace"``,
                ``"equal"`` or ``"delete"``.  The ``a_*`` bounds slice `old`
                and the ``b_*`` bounds slice `new` (the same layout as the
                opcodes of :class:`difflib.SequenceMatcher`).
            old
                The previous sequence of tokens.
            new
                The new sequence of token texts.

        :Returns:
            A ``(tokens, tokens_added, tokens_removed)`` tuple of `cls`
            instances: the resulting tokens, the tokens newly created for
            inserted/replacing text, and the old tokens that were deleted or
            replaced.

        :Raises:
            AssertionError
                If an operation code is not recognized.
        """
        tokens = cls()
        tokens_added = cls()
        tokens_removed = cls()

        for code, a_start, a_end, b_start, b_end in ops:
            if code in ("insert", "replace"):
                # Text in `new` that was not carried over from `old` gets
                # brand new tokens.
                created = [Token(text) for text in new[b_start:b_end]]
                tokens.extend(created)
                tokens_added.extend(created)

                if code == "replace":
                    # A replace also drops the old tokens it overwrote.
                    tokens_removed.extend(old[a_start:a_end])

            elif code == "equal":
                # Unchanged text keeps its existing token objects (and so
                # their revision history).
                tokens.extend(old[a_start:a_end])

            elif code == "delete":
                tokens_removed.extend(old[a_start:a_end])

            else:
                # Raised explicitly rather than via ``assert False`` so the
                # check is not stripped when Python runs with -O.
                raise AssertionError(
                    "encounted an unrecognized operation code: " + repr(code)
                )

        return (tokens, tokens_added, tokens_removed)

Community Data Science Collective || Want to submit a patch?