]> code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq
add support for persistence with segment matching
[mediawiki_dump_tools.git] / wikiq
diff --git a/wikiq b/wikiq
index bc6b06ded1f7f0ca1e8c4e81c8337e3a02930118..7cf4be26e60db633db9d210e7f5ec74addf1f357 100755 (executable)
--- a/wikiq
+++ b/wikiq
@@ -22,12 +22,18 @@ from urllib.parse import quote
 TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS=7
 from deltas import SequenceMatcher
 TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS=7
 from deltas import SequenceMatcher
+from deltas import SegmentMatcher
+
+class PersistMethod:
+    none = 0
+    sequence = 1
+    segment = 2
+    legacy = 3
 
 def calculate_persistence(tokens_added):
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
            len(tokens_added))
 
 
 def calculate_persistence(tokens_added):
     return(sum([(len(x.revisions)-1) for x in tokens_added]),
            len(tokens_added))
 
-
 class WikiqIterator():
     def __init__(self, fh, collapse_user=False):
         self.fh = fh
 class WikiqIterator():
     def __init__(self, fh, collapse_user=False):
         self.fh = fh
@@ -116,14 +122,17 @@ class WikiqPage():
         return next(self.__revisions)
 
 class WikiqParser():
         return next(self.__revisions)
 
 class WikiqParser():
+    
+    def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False):
+        """ 
+        Parameters:
+           persist : what persistence method to use. Takes a PersistMethod value
+        """
 
 
-    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
-        
         self.input_file = input_file
         self.output_file = output_file
         self.collapse_user = collapse_user
         self.persist = persist
         self.input_file = input_file
         self.output_file = output_file
         self.collapse_user = collapse_user
         self.persist = persist
-        self.persist_legacy = persist_legacy
         self.printed_header = False
         self.namespaces = []
         self.urlencode = urlencode
         self.printed_header = False
         self.namespaces = []
         self.urlencode = urlencode
@@ -164,13 +173,18 @@ class WikiqParser():
         for page in dump:
             rev_detector = mwreverts.Detector()
 
         for page in dump:
             rev_detector = mwreverts.Detector()
 
-            if self.persist or self.persist_legacy:
+            if self.persist != PersistMethod.none:
                 window = deque(maxlen=PERSISTENCE_RADIUS)
 
                 window = deque(maxlen=PERSISTENCE_RADIUS)
 
-                if not self.persist_legacy:
+                if self.persist == PersistMethod.sequence:
                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                     revert_radius=PERSISTENCE_RADIUS)
 
                     state = mwpersistence.DiffState(SequenceMatcher(tokenizer = wikitext_split),
                                                     revert_radius=PERSISTENCE_RADIUS)
 
+                elif self.persist == PersistMethod.segment:
+                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer = wikitext_split),
+                                                    revert_radius=PERSISTENCE_RADIUS)
+
+                # self.persist == PersistMethod.legacy
                 else:
                     from mw.lib import persistence
                     state = persistence.State()
                 else:
                     from mw.lib import persistence
                     state = persistence.State()
@@ -243,14 +257,13 @@ class WikiqParser():
                 if self.collapse_user:
                     rev_data['collapsed_revs'] = rev.collapsed_revs
 
                 if self.collapse_user:
                     rev_data['collapsed_revs'] = rev.collapsed_revs
 
-                if self.persist or self.persist_legacy:
+                if self.persist != PersistMethod.none:
                     if rev.deleted.text:
                     if rev.deleted.text:
-
                         for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                             old_rev_data[k] = None
                     else:
 
                         for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                             old_rev_data[k] = None
                     else:
 
-                        if not self.persist_legacy:
+                        if self.persist != PersistMethod.legacy:
                             _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
 
                         else:
                             _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
 
                         else:
@@ -275,7 +288,7 @@ class WikiqParser():
 
                 rev_count += 1
 
 
                 rev_count += 1
 
-            if self.persist or self.persist_legacy:
+            if self.persist != PersistMethod.none:
                 # print out metadata for the last RADIUS revisions
                 for i, item in enumerate(window):
                     # if the window was full, we've already printed item 0
                 # print out metadata for the last RADIUS revisions
                 for i, item in enumerate(window):
                     # if the window was full, we've already printed item 0
@@ -349,17 +362,25 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                     help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
 
 parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                     help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")
 
-parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
-                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")
+parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
+                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  Use -p=segment for advanced persistence calculation method that is robust to content moves. This might be very slow. Use -p=legacy for legacy behavior.")
 
 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                     help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
 
 
 parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                     help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
 
-parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
-                    help="Legacy behavior for persistence calculation. Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
-
 args = parser.parse_args()
 
 args = parser.parse_args()
 
+# set persistence method
+
+if args.persist is None:
+    persist = PersistMethod.none
+elif args.persist == "segment":
+    persist = PersistMethod.segment
+elif args.persist == "legacy":
+    persist = PersistMethod.legacy
+else:
+    persist = PersistMethod.sequence
+
 if len(args.dumpfiles) > 0:
     for filename in args.dumpfiles:
         input_file = open_input_file(filename)
 if len(args.dumpfiles) > 0:
     for filename in args.dumpfiles:
         input_file = open_input_file(filename)
@@ -378,10 +399,9 @@ if len(args.dumpfiles) > 0:
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = open_output_file(filename)
 
             filename = os.path.join(output_dir, os.path.basename(filename))
             output_file = open_output_file(filename)
 
-        wikiq = WikiqParser(input_file, output_file, 
+            wikiq = WikiqParser(input_file, output_file, 
                             collapse_user=args.collapse_user,
                             collapse_user=args.collapse_user,
-                            persist=args.persist,
-                            persist_legacy=args.persist_legacy,
+                                persist=persist,
                             urlencode=args.urlencode)
 
 
                             urlencode=args.urlencode)
 
 
@@ -393,7 +413,7 @@ if len(args.dumpfiles) > 0:
 else:
     wikiq = WikiqParser(sys.stdin, sys.stdout,
                         collapse_user=args.collapse_user,
 else:
     wikiq = WikiqParser(sys.stdin, sys.stdout,
                         collapse_user=args.collapse_user,
-                        persist=args.persist,
+                        persist=persist,
                         persist_legacy=args.persist_legacy,
                         urlencode=args.urlencode)
     wikiq.process()
                         persist_legacy=args.persist_legacy,
                         urlencode=args.urlencode)
     wikiq.process()

Community Data Science Collective || Want to submit a patch?