3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
10 from wikiq_util import calculate_persistence
11 from wikiq_util import WikiqIterator
12 from wikiq_util import WikiqPage
13 from wikiq_util import WikiqParser
14 from wikiq_util import open_input_file
15 from wikiq_util import open_output_file
# Build the command-line interface: a positional list of dump files plus
# behavior flags. All options except -o/--output-dir are store_true switches.
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for persistence calculation.")

args = parser.parse_args()
# Main dispatch: either iterate over the dump files named on the command
# line, or fall back to a stdin -> stdout filter when none were given.
if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output; default to the current directory
        # when -o/--output-dir was not supplied (args.output_dir is None
        # in that case, and indexing it would raise TypeError).
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        # Honor -s/--stdout: only open a per-dump output file when the
        # user did not ask for standard out.
        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        # NOTE(review): reconstructed from the stripped original — the
        # parser must actually be run and the streams released; confirm
        # the driver method name against wikiq_util.WikiqParser.
        wikiq.process()
        input_file.close()
        output_file.close()

else:
    # No dump files given: read the dump from stdin and write to stdout.
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()
82 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
83 # stop_words = stop_words.split(",")