#!/usr/bin/env python3

# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
"""Command-line driver that runs WikiqParser over MediaWiki XML dumps.

Each DUMPFILE given on the command line is opened with open_input_file()
and parsed into an output file (created with open_output_file() in the
output directory) or onto standard output when --stdout is given.  When no
DUMPFILE is given, the dump is read from stdin and written to stdout.
"""

import argparse
import sys
import os

sys.path.append("..")

from wikiq_util import calculate_persistence
from wikiq_util import WikiqIterator
from wikiq_util import WikiqPage
from wikiq_util import WikiqParser
from wikiq_util import open_input_file
from wikiq_util import open_output_file

parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

# The help text here used to carry a pasted copy of the --url-encode
# description; only the sentence that applies to this flag is kept.
parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for persistence calculation.")

args = parser.parse_args()

# Keyword arguments shared by every WikiqParser built below.
parser_options = dict(collapse_user=args.collapse_user,
                      persist=args.persist,
                      persist_legacy=args.persist_legacy,
                      urlencode=args.urlencode)

if args.dumpfiles:
    # -o is declared with nargs=1, so argparse stores a one-element list.
    if args.output_dir:
        output_dir = args.output_dir[0]
    else:
        output_dir = "."

    for filename in args.dumpfiles:
        input_file = open_input_file(filename)
        try:
            print("Processing file: %s" % filename, file=sys.stderr)

            if args.stdout:
                output_file = sys.stdout
            else:
                # The output file takes the dump's base name, placed in
                # the output directory.
                output_path = os.path.join(output_dir, os.path.basename(filename))
                output_file = open_output_file(output_path)

            try:
                wikiq = WikiqParser(input_file, output_file, **parser_options)
                wikiq.process()
            finally:
                # sys.stdout is shared by every dump file in this run:
                # closing it after the first file would make the next
                # file's writes fail, so it is only flushed.
                if output_file is sys.stdout:
                    output_file.flush()
                else:
                    output_file.close()
        finally:
            # Release the input handle even when parsing raised.
            input_file.close()
else:
    # No dump files named: stream from stdin to stdout.
    wikiq = WikiqParser(sys.stdin, sys.stdout, **parser_options)
    wikiq.process()

# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")