import argparse
import os
from os import path
import re

import requests
from json.decoder import JSONDecodeError
from scraper_utils import prepare_output, read_wikilist, add_parser_arguments
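

# Scrape MediaWiki log events (user rights changes and, optionally,
# block/unblock events) for each wiki in a wikilist, writing one TSV of
# events per wiki.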
def write_logevents(logevents, out):
    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if any(['userhidden' in logevent,
                'actionhidden' in logevent,
                'commenthidden' in logevent]):
            continue

        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     logevent['title'],
                     logevent['type'],
                     str(logevent['user'])]

        # events with no 'rights' field are flagged 'true' in the
        # 'ancient' column and get empty rights-new / rights-old
        if "rights" in logevent:
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        else:
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
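

# write_blockevents mirrors write_logevents; it writes block/unblock
# events to their own per-wiki TSV when --blocks-output is given.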
def write_blockevents(logevents, out):
    for logevent in logevents:
        # if there is hidden information, we skip this one because there
        # is nothing to report
        if any(['userhidden' in logevent,
                'actionhidden' in logevent,
                'commenthidden' in logevent]):
            continue

        le_output = [logevent['comment'],
                     str(logevent['logid']),
                     str(logevent['ns']),
                     str(logevent['pageid']),
                     logevent['timestamp'],
                     logevent['title'],
                     logevent['type'],
                     str(logevent['user'])]

        if "rights" in logevent:
            le_output.extend(['false',
                              logevent['rights']['new'],
                              logevent['rights']['old']])
        else:
            le_output.extend(['true', '', ''])

        out.write("\t".join(le_output) + "\n")
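

# Fetch all 'rights' log events (plus block/unblock events when
# blocks_output is set) for a single wiki from the MediaWiki API,
# following continuation until the log is exhausted.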
def get_events_for_wiki(wikiname, url, output_dir, blocks_output=None, wikitype="wikia"):
    out = open("{0}/{1}.tsv".format(output_dir, wikiname), "w")
    out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
                         'title', 'type', 'user', 'ancient', 'rights-new',
                         'rights-old']) + "\n")

    if wikitype == "wikia":
        api_url = url + '/api.php'
    else:  # wikitype == "wikipedia"
        api_url = url + "/w/api.php"

    letype = 'rights'
    if blocks_output is not None:
        letype = 'rights|block|unblock'
        blockout = open("{0}/{1}.tsv".format(blocks_output, wikiname), "w")
        blockout.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp',
                                  'title', 'type', 'user', 'ancient', 'rights-new',
                                  'rights-old']) + "\n")
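
    # list=logevents query; lelimit 500 is assumed here (the usual
    # non-bot API maximum) and ledir='newer' pages forward in time so
    # that continuation via 'lestart' resumes where we left off.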
    query = {'action': 'query',
             'list': 'logevents',
             'letype': letype,
             'lelimit': '500',
             'ledir': 'newer',
             'format': 'json'}
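
    # If the response is not JSON, the request was redirected to an HTML
    # page; on Wikia, deleted wikis redirect to the Community Central
    # "not a valid Wikia" page.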
    try:
        response = requests.get(api_url, params=query)
        rv = response.json()

    except JSONDecodeError:
        api_url = response.url
        # print(api_url)  # debug
        if wikitype == "wikia":
            re_str = r"^http://(community|www)\.wikia\.com/"
        else:  # wikitype == "wikipedia"
            re_str = r"^(http|https)://.*wikipedia\.org/"

        if re.match(re_str, api_url):
            # the request was redirected to
            # 'http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia',
            # so the wiki no longer exists
            print("ERROR: %s no longer exists" % wikiname)
            return
        else:
            # we were redirected within a live wiki; retry at the new url
            response = requests.get(api_url, params=query)
            rv = response.json()
    if 'query' in rv:
        logevents = rv['query']['logevents']
        # block and unblock events go to a separate output file
        blockevents = [e for e in logevents
                       if (e['action'] in ['block', 'unblock'])
                       or (e['type'] in ['block', 'unblock'])]
        logevents = [e for e in logevents if e not in blockevents]
        write_logevents(logevents, out)
        if blocks_output is not None:
            write_blockevents(blockevents, blockout)
    else:
        print("ERROR: %s contains no logevent data" % wikiname)
        return
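
    # Page through the rest of the log: older MediaWiki versions signal
    # more results with 'query-continue', newer ones with 'continue'.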
    while 'query-continue' in rv or 'continue' in rv:
        if 'query-continue' in rv:
            query['lestart'] = rv['query-continue']['logevents']['lestart']
        else:
            query['continue'] = rv['continue']['continue']
            query['lecontinue'] = rv['continue']['lecontinue']

        response = requests.get(api_url, params=query)
        rv = response.json()
        logevents = rv['query']['logevents']
        write_logevents(logevents, out)

    out.close()
    if blocks_output is not None:
        blockout.close()
# Example invocation (IPython):
# %run userroles_from_logevents.py --sep=\\t --nuke-old --blocks-output=/com/projects/messagewalls/userroles/blockevents ../identifyWikis/wikiteamWikilist.tsv /com/projects/messagewalls/userroles/listusers
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Get user roles for wikis from the MediaWiki logevents API")

    parser = add_parser_arguments(parser)

    parser.add_argument('--blocks-output',
                        default=None,
                        help='Path to output block event logs. If empty, blocks are ignored.')

    args = parser.parse_args()
    output_path = args.output
    blocks_output = args.blocks_output
    header = not args.no_header

    prepare_output(output_path, args.nuke_old)

    if blocks_output is not None:
        prepare_output(blocks_output, args.nuke_old)

    wikilist = read_wikilist(args)
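
    # Outputs already on disk are skipped in the loop below, so an
    # interrupted run can be restarted without redoing finished wikis.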
    files = [os.path.join(output_path, i) for i in os.listdir(output_path)]

    # iterate through the list of wikis
    # for line in ["anime,http://anime.wikia.com/"]:
    # for line in ["blogging,http://blogging.wikia.com/"]:
    # for line in open("list_of_wikis.csv", "r").readlines():
    for wiki, url, wikitype in wikilist:
        if "{0}.{1}".format(path.join(output_path, wiki), 'tsv') in files:
            print("SKIPPING: file \"%s\" already exists" % wiki)
            continue
195 print("SKIPPING: file \"%s\" already exists)" % wiki)
198 print("Processing wiki: %s" % wiki)
202 output_dir=output_path,
203 blocks_output=blocks_output,