code.communitydata.science - mediawiki_dump_tools.git/blob - bin/wikiq
add more variables and support for persistence
[mediawiki_dump_tools.git] / bin / wikiq
1 #!/usr/bin/env python3
2
3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
6 import argparse
7 import sys
8 import os
9 sys.path.append("..")
10 from wikiq_util import calculate_persistence
11 from wikiq_util import WikiqIterator
12 from wikiq_util import WikiqPage
13 from wikiq_util import WikiqParser
14 from wikiq_util import open_input_file
15 from wikiq_util import open_output_file
16
17
# Command-line interface for wikiq.  All boolean switches default to False;
# `dumpfiles` may be empty, and `output_dir` is None unless -o is given.
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

# nargs=1 means the parsed value is a one-element list, not a bare string.
parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

# The help text here used to repeat the --url-encode description verbatim,
# which does not describe what this flag controls.
parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for persistence calculation.")
41
# Parse the command line, then run one WikiqParser per dump file (or a single
# stdin -> stdout pass when no dump files were named).
args = parser.parse_args()

# --output-dir is declared with nargs=1, so the value arrives as a
# one-element list.  It is the same for every dump, so resolve it once.
output_dir = args.output_dir[0] if args.output_dir else "."

if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)
        output_file = None

        try:
            print("Processing file: %s" % filename, file=sys.stderr)

            if args.stdout:
                output_file = sys.stdout
            else:
                # Output is named after the dump file and placed in output_dir.
                output_path = os.path.join(output_dir, os.path.basename(filename))
                output_file = open_output_file(output_path)

            wikiq = WikiqParser(input_file, output_file,
                                collapse_user=args.collapse_user,
                                persist=args.persist,
                                persist_legacy=args.persist_legacy,
                                urlencode=args.urlencode)

            wikiq.process()
        finally:
            # Release the handles even when processing fails part-way.
            input_file.close()

            # Never close sys.stdout: with --stdout and several dump files the
            # next iteration still has to write to it.
            if output_file is not None and output_file is not sys.stdout:
                output_file.close()
else:
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()
81
82 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
83 # stop_words = stop_words.split(",")

Community Data Science Collective || Want to submit a patch?