3 # original wikiq headers are: title articleid revid date_time anon
4 # editor editor_id minor text_size text_entropy text_md5 reversion
5 # additions_size deletions_size
10 from wikiq_util import calculate_persistence
11 from wikiq_util import WikiqIterator
12 from wikiq_util import WikiqPage
13 from wikiq_util import WikiqParser
14 from wikiq_util import open_input_file
15 from wikiq_util import open_output_file
# Build the command-line interface: a positional list of dump files plus
# behavior flags. All options except -o/--output-dir are store_true switches.
parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

# arguments for the input direction
parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")

parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
                    help="Directory for output files.")

parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
                    help="Write output to standard out (do not create dump file)")

parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                    help="Operate only on the final revision made by a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure.")

parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

parser.add_argument('--persistence-legacy', dest="persist_legacy", action="store_true",
                    help="Legacy behavior for persistence calculation.")

args = parser.parse_args()
# Main dispatch: either iterate over the dump files named on the command
# line, or fall back to a stdin -> stdout filter when none were given.
if len(args.dumpfiles) > 0:
    for filename in args.dumpfiles:
        input_file = open_input_file(filename)

        # open directory for output; default to the current directory
        # when -o/--output-dir was not supplied (args.output_dir is None
        # in that case, and indexing it would raise TypeError).
        if args.output_dir:
            output_dir = args.output_dir[0]
        else:
            output_dir = "."

        print("Processing file: %s" % filename, file=sys.stderr)

        # Honor -s/--stdout: only open a per-dump output file when the
        # user did not ask for standard out.
        if args.stdout:
            output_file = sys.stdout
        else:
            filename = os.path.join(output_dir, os.path.basename(filename))
            output_file = open_output_file(filename)

        wikiq = WikiqParser(input_file, output_file,
                            collapse_user=args.collapse_user,
                            persist=args.persist,
                            persist_legacy=args.persist_legacy,
                            urlencode=args.urlencode)

        # NOTE(review): reconstructed from the stripped original — the
        # parser must actually be run and the streams released; confirm
        # the driver method name against wikiq_util.WikiqParser.
        wikiq.process()
        input_file.close()
        output_file.close()

else:
    # No dump files given: read the dump from stdin and write to stdout.
    wikiq = WikiqParser(sys.stdin, sys.stdout,
                        collapse_user=args.collapse_user,
                        persist=args.persist,
                        persist_legacy=args.persist_legacy,
                        urlencode=args.urlencode)
    wikiq.process()
82 # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
83 # stop_words = stop_words.split(",")