#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size

import argparse
import sys
import os, os.path
import re

from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1

from mwxml import Dump

from deltas.tokenizers import wikitext_split

import mwpersistence
import mwreverts

from urllib.parse import quote
TO_ENCODE = ('title', 'editor')
PERSISTENCE_RADIUS = 7
from deltas import SequenceMatcher
 
def calculate_persistence(tokens_added):
    # a token counts one "persistent token revision" for every revision
    # (beyond the one that introduced it) in which it survives
    return(sum([(len(x.revisions)-1) for x in tokens_added]),
           len(tokens_added))
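# A minimal sketch of what calculate_persistence computes, assuming token
# objects shaped like mwpersistence's (each carrying a .revisions list).
# FakeToken below is a hypothetical stand-in, for illustration only:
#
#   from collections import namedtuple
#   FakeToken = namedtuple('FakeToken', ['revisions'])
#   calculate_persistence([FakeToken([1, 2, 3]), FakeToken([1])])
#   # -> (2, 2): two persistent token revisions across two tokens added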
 
class WikiqIterator():
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
        self.mwiterator = Dump.from_file(self.fh)
        self.namespace_map = { ns.id : ns.name for ns in
                               self.mwiterator.site_info.namespaces }
        self.__pages = self.load_pages()

    def load_pages(self):
        for page in self.mwiterator:
            yield WikiqPage(page,
                            namespace_map=self.namespace_map,
                            collapse_user=self.collapse_user)

    def __iter__(self):
        return self.__pages

    def __next__(self):
        return next(self.__pages)
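# Usage sketch (the path is hypothetical): WikiqIterator wraps mwxml.Dump and
# yields WikiqPage objects, which in turn yield that page's revisions.
#
#   with open("dump.xml") as fh:
#       for page in WikiqIterator(fh, collapse_user=True):
#           for rev in page:
#               print(rev.id)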
 
class WikiqPage():
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')

    def __init__(self, page, namespace_map, collapse_user=False):
        self.id = page.id
        self.namespace = page.namespace
        if page.namespace != 0:
            self.title = ':'.join([namespace_map[page.namespace], page.title])
        else:
            self.title = page.title
        self.restrictions = page.restrictions
        self.collapse_user = collapse_user
        self.mwpage = page
        self.__revisions = self.rev_list()

    def rev_list(self):
        # Outline for how we want to handle collapse_user=True
        # iteration   rev.user   prev_rev.user   add prev_rev?
        #         0          A            None              NO
        #         1          A               A              NO
        #         2          B               A             YES
        #         3          A               B             YES
        #         4          A               A              NO
        # post-loop                          A             YES
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
                if self.collapse_user:
                    collapsed_revs = 1
                    rev.collapsed_revs = collapsed_revs
            else:
                if self.collapse_user:
                    # yield if this is the last edit in a sequence by a user and reset;
                    # also yield if we don't know who the user is
                    if rev.deleted.user or prev_rev.deleted.user:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    elif not rev.user.text == prev_rev.user.text:
                        yield prev_rev
                        collapsed_revs = 1
                        rev.collapsed_revs = collapsed_revs
                    # otherwise, add one to the counter
                    else:
                        collapsed_revs += 1
                        rev.collapsed_revs = collapsed_revs
                # if collapse_user is false, we always yield
                else:
                    yield prev_rev

            prev_rev = rev

        # also yield the final time
        yield prev_rev

    def __iter__(self):
        return self.__revisions

    def __next__(self):
        return next(self.__revisions)
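# Worked example of the collapse logic above: with collapse_user=True, a page
# whose revisions were saved by users A, A, B, A yields three revisions instead
# of four: the second A revision (collapsed_revs == 2), the B revision
# (collapsed_revs == 1), and the final A revision (collapsed_revs == 1).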
 
class WikiqParser():

    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False, persist_legacy=False):
        self.input_file = input_file
        self.output_file = output_file
        self.collapse_user = collapse_user
        self.persist = persist
        self.persist_legacy = persist_legacy
        self.printed_header = False
        self.namespaces = []
        self.urlencode = urlencode
 
    def __get_namespace_from_title(self, title):
        default_ns = None

        for ns in self.namespaces:
            # skip if the namespace is not defined
            if ns == None:
                default_ns = self.namespaces[ns]
                continue

            if title.startswith(ns + ":"):
                return self.namespaces[ns]

        # if we've made it this far with no matches, we return the default namespace
        return default_ns
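    # Sketch of the lookup above, assuming self.namespaces maps names to ids
    # the way process() builds it (the names and ids here are illustrative):
    #
    #   self.namespaces = {None: 0, 'Talk': 1, 'User': 2}
    #   __get_namespace_from_title("Talk:Sandbox")  ->  1  (prefix match)
    #   __get_namespace_from_title("Sandbox")       ->  0  (the default namespace)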
 
    def process(self):

        # create a regex that creates the output filename
        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
        #                          r'output/wikiq-\1-\2.tsv',
        #                          input_filename)

        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

        # extract list of namespaces
        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

        # Iterate through pages
        for page in dump:
            rev_detector = mwreverts.Detector()

            if self.persist or self.persist_legacy:
                window = deque(maxlen=PERSISTENCE_RADIUS)

                if not self.persist_legacy:
                    state = mwpersistence.DiffState(SequenceMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
                    state = persistence.State()
 
            # Iterate through a page's revisions
            for rev in page:

                rev_data = {'revid' : rev.id,
                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                            'articleid' : page.id,
                            'editor_id' : "" if rev.deleted.user or rev.user.id is None else rev.user.id,
                            'title' : '"' + page.title + '"',
                            'namespace' : page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title),
                            'deleted' : "TRUE" if rev.deleted.text else "FALSE" }
 
                # if revisions are deleted, /many/ things will be missing
                if rev.deleted.text:
                    rev_data['text_chars'] = ""
                    rev_data['sha1'] = ""
                    rev_data['revert'] = ""
                    rev_data['reverteds'] = ""

                else:
                    # rev.text can be None if the page has no text
                    if not rev.text:
                        rev.text = ""

                    # if the dump provides a sha1, use it; otherwise generate one
                    if rev.sha1:
                        text_sha1 = rev.sha1
                    else:
                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()

                    rev_data['sha1'] = text_sha1

                    # TODO rev.bytes doesn't work.. looks like a bug
                    rev_data['text_chars'] = len(rev.text)

                    # generate revert data
                    revert = rev_detector.process(text_sha1, rev.id)
                    if revert:
                        rev_data['revert'] = "TRUE"
                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
                    else:
                        rev_data['revert'] = "FALSE"
                        rev_data['reverteds'] = ""
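                    # Sketch of the detector above (the checksums are made up):
                    # a revision whose checksum matches an earlier one within
                    # the detector's radius is reported as a revert.
                    #
                    #   detector = mwreverts.Detector()
                    #   detector.process("aaa", 1)           # -> None
                    #   detector.process("bbb", 2)           # -> None
                    #   revert = detector.process("aaa", 3)  # back to "aaa"
                    #   revert.reverteds                     # -> [2]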
 
                # if the fact that the edit was minor can be hidden, this might be an issue
                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"

                if not rev.deleted.user:
                    # wrap user-defined editors in quotes for fread
                    rev_data['editor'] = '"' + rev.user.text + '"'
                    rev_data['anon'] = "TRUE" if rev.user.id is None else "FALSE"
                else:
                    rev_data['anon'] = ""
                    rev_data['editor'] = ""
 
                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
                #    redirect = True
                #else:
                #    redirect = False

                #TODO missing: additions_size deletions_size

                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data['collapsed_revs'] = rev.collapsed_revs
 
                if self.persist or self.persist_legacy:
                    if rev.deleted.text:
                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
                            rev_data[k] = None
                    else:
                        if not self.persist_legacy:
                            _, tokens_added, tokens_removed = state.update(rev.text, rev.id)
                        else:
                            _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)

                        window.append((rev.id, rev_data, tokens_added, tokens_removed))

                        if len(window) == PERSISTENCE_RADIUS:
                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]

                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)

                            old_rev_data["token_revs"] = num_token_revs
                            old_rev_data["tokens_added"] = num_tokens
                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1

                            self.print_rev_data(old_rev_data)

                else:
                    self.print_rev_data(rev_data)

                rev_count += 1
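            # Note on the deque above: a revision's persistence numbers depend
            # on the PERSISTENCE_RADIUS revisions that follow it, so output is
            # delayed until a revision falls out of the window; whatever is
            # still in the window when the page ends is flushed below.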
 
            if self.persist or self.persist_legacy:
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
                    if len(window) == PERSISTENCE_RADIUS and i == 0:
                        continue

                    rev_id, rev_data, tokens_added, tokens_removed = item
                    num_token_revs, num_tokens = calculate_persistence(tokens_added)

                    rev_data["token_revs"] = num_token_revs
                    rev_data["tokens_added"] = num_tokens
                    rev_data["tokens_removed"] = len(tokens_removed)
                    rev_data["tokens_window"] = len(window)-(i+1)

                    self.print_rev_data(rev_data)

            page_count += 1

        print("Done: %s revisions and %s pages." % (rev_count, page_count),
              file=sys.stderr)
 
    def print_rev_data(self, rev_data):
        if self.urlencode:
            for field in TO_ENCODE:
                rev_data[field] = quote(str(rev_data[field]))

        # if it's the first time through, print the header
        if not self.printed_header:
            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
            self.printed_header = True

        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
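    # Both the header and every row are emitted in sorted-key order, which is
    # what keeps columns aligned without an explicit field list. A toy example
    # (the values are made up):
    #
    #   rev_data = {'revid': 1, 'anon': 'FALSE', 'minor': 'TRUE'}
    #   header -> "anon\tminor\trevid"
    #   row    -> "FALSE\tTRUE\t1"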
 
def open_input_file(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.gz$', input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]

    try:
        input_file = Popen(cmd, stdout=PIPE).stdout
    except NameError:
        # cmd is unbound for uncompressed input, so fall back to a plain open
        input_file = open(input_filename, 'r')

    return input_file
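# Usage sketch (the paths are hypothetical): open_input_file("dump.xml.bz2")
# returns the stdout pipe of a bzcat subprocess, while open_input_file("dump.xml")
# matches no pattern and falls through to a plain open(); Dump.from_file is
# expected to accept either object.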
 
def open_output_file(input_filename):
    # derive the output filename from the input filename:
    # strip the compression suffix and .xml, then append .tsv
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
    output_filename = re.sub(r'\.xml', '', output_filename)
    output_filename = output_filename + ".tsv"
    output_file = open(output_filename, "w")

    return output_file
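# For example (the filename is illustrative), "enwiki-20200101-pages.xml.bz2"
# becomes "enwiki-20200101-pages.tsv".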
 
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for input and output
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within each sequence of consecutive edits made by that user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) number of revisions used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output URL-encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for the persistence calculation.")

args = parser.parse_args()
 
if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        wikiq.process()

        input_file.close()
        output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)

    wikiq.process()
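# Example invocations, assuming this script is saved as wikiq (the filenames
# and paths are illustrative):
#
#   python3 wikiq enwiki-20200101-pages.xml.bz2 -o output/
#   bzcat enwiki-20200101-pages.xml.bz2 | python3 wikiq --collapse-user > enwiki.tsv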
 
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")