1 from hashlib import sha1
5 from .tokens import Token, Tokens
17 Represents the state of word persistence in a page.
18 See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_
21 tokenize : function( `str` ) --> list( `str` )
23 diff : function(list( `str` ), list( `str` )) --> list( `ops` )
24 A function to perform a difference between token lists
26 A positive integer indicating the maximum revision distance that a revert can span.
27 revert_detector : :class:`mw.lib.reverts.Detector`
28 a revert detector to start process with
30 >>> from pprint import pprint
31 >>> from mw.lib import persistence
33 >>> state = persistence.State()
35 >>> pprint(state.process("Apples are red.", revision=1))
36 ([Token(text='Apples', revisions=[1]),
37 Token(text=' ', revisions=[1]),
38 Token(text='are', revisions=[1]),
39 Token(text=' ', revisions=[1]),
40 Token(text='red', revisions=[1]),
41 Token(text='.', revisions=[1])],
42 [Token(text='Apples', revisions=[1]),
43 Token(text=' ', revisions=[1]),
44 Token(text='are', revisions=[1]),
45 Token(text=' ', revisions=[1]),
46 Token(text='red', revisions=[1]),
47 Token(text='.', revisions=[1])],
49 >>> pprint(state.process("Apples are blue.", revision=2))
50 ([Token(text='Apples', revisions=[1, 2]),
51 Token(text=' ', revisions=[1, 2]),
52 Token(text='are', revisions=[1, 2]),
53 Token(text=' ', revisions=[1, 2]),
54 Token(text='blue', revisions=[2]),
55 Token(text='.', revisions=[1, 2])],
56 [Token(text='blue', revisions=[2])],
57 [Token(text='red', revisions=[1])])
58 >>> pprint(state.process("Apples are red.", revision=3)) # A revert!
59 ([Token(text='Apples', revisions=[1, 2, 3]),
60 Token(text=' ', revisions=[1, 2, 3]),
61 Token(text='are', revisions=[1, 2, 3]),
62 Token(text=' ', revisions=[1, 2, 3]),
63 Token(text='red', revisions=[1, 3]),
64 Token(text='.', revisions=[1, 2, 3])],
69 def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
70 revert_radius=reverts.defaults.RADIUS,
71 revert_detector=None):
72 self.tokenize = tokenize
75 # Either pass a detector or the revert radius so I can make one
76 if revert_detector is None:
77 self.revert_detector = reverts.Detector(int(revert_radius))
79 self.revert_detector = revert_detector
81 # Stores the last tokens
84 def process(self, text, revision=None, checksum=None):
86 Modifies the internal state based a change to the content and returns
87 the sets of words added and removed.
91 The text content of a revision
95 A checksum hash of the text content (will be generated if not provided)
98 Three :class:`~mw.lib.persistence.Tokens` lists
100 current_tokens : :class:`~mw.lib.persistence.Tokens`
101 A sequence of :class:`~mw.lib.persistence.Token` for the
103 tokens_added : :class:`~mw.lib.persistence.Tokens`
104 A set of tokens that were inserted by the processed revision
105 tokens_removed : :class:`~mw.lib.persistence.Tokens`
106 A sequence of :class:`~mw.lib.persistence.Token` removed by the
111 checksum = sha1(bytes(text, 'utf8')).hexdigest()
115 revert = self.revert_detector.process(checksum, version)
116 if revert is not None: # Revert
119 tokens_added = Tokens()
120 tokens_removed = Tokens()
122 # Extract reverted_to revision
123 _, _, reverted_to = revert
124 version.tokens = reverted_to.tokens
128 if self.last is None: # First version of the page!
130 version.tokens = Tokens(Token(t) for t in self.tokenize(text))
131 tokens_added = version.tokens
132 tokens_removed = Tokens()
136 # NOTICE: HEAVY COMPUTATION HERE!!!
138 # OK. It's not that heavy. It's just performing a diff,
139 # but you're still going to spend most of your time here.
140 # Diffs usually run in O(n^2) -- O(n^3) time and most tokenizers
141 # produce a lot of tokens.
142 version.tokens, tokens_added, tokens_removed = \
143 self.last.tokens.compare(self.tokenize(text), self.diff)
145 version.tokens.persist(revision)
149 return version.tokens, tokens_added, tokens_removed