From: sohyeonhwang
Date: Thu, 7 Nov 2019 19:28:17 +0000 (-0600)
Subject: merging pull containing revert-radius with 2nd version of regex scanner w/ unit tests
X-Git-Url: https://code.communitydata.science/mediawiki_dump_tools.git/commitdiff_plain/f147e1d899ea269dad715aea393153683353e946?hp=-c

merging pull containing revert-radius with 2nd version of regex scanner w/ unit tests
---

f147e1d899ea269dad715aea393153683353e946
diff --combined test/Wikiq_Unit_Test.py
index 9011d79,9c85109..159fd10
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@@ -73,29 -73,6 +73,29 @@@ class Test_Wikipedia(unittest.TestCase)
          baseline = pd.read_table(baseline_file)
          assert_frame_equal(test,baseline)
  
 +    def test_WP_revert_radius(self):
 +        print(os.path.abspath('.'))
 +        test_filename = "revert_radius_" + self.wikiq_out_name
 +        test_file = os.path.join(self.test_output_dir, test_filename)
 +        if os.path.exists(test_file):
 +            os.remove(test_file)
 +
 +        call = self.base_call.format(self.input_file, self.test_output_dir)
 +        call = call + " -n 0 -n 1 -rr 1"
 +        print(call)
 +        proc = subprocess.Popen(call, stdout=subprocess.PIPE, shell=True)
 +        proc.wait()
 +        copyfile(self.call_output, test_file)
 +        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
 +
 +        # as a test, let's make sure that we get equal data frames
 +        test = pd.read_table(test_file)
 +        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
 +        self.assertEqual(num_wrong_ns, 0)
 +        baseline = pd.read_table(baseline_file)
 +        assert_frame_equal(test, baseline)
 +
 +
  class Test_Basic(unittest.TestCase):
  
@@@ -282,6 -259,6 +282,119 @@@ class Test_Stdout(unittest.TestCase)
          test = pd.read_table(StringIO(outs))
          baseline = pd.read_table(baseline_file)
          assert_frame_equal(test,baseline)
--
++
++class Test_Regex(unittest.TestCase):
++
++    def setUp(self):
++        self.wiki = 'regextest'
++        self.wikiq_out_name = self.wiki + '.tsv'
++        self.infile = "{0}.xml.bz2".format(self.wiki)
++
++        self.input_dir = "dumps"
++        self.input_file = os.path.join(".", self.input_dir, self.infile)
++
++        if not os.path.exists("test_output"):
++            os.mkdir("test_output")
++
++        self.test_output_dir = os.path.join(".", "test_output")
++        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
++        # we have two base calls: one for checking inputs and the other for checking outputs
++        self.base_call = "../wikiq {0}"
++        self.base_call_outs = "../wikiq {0} -o {1}"
++
++        self.baseline_output_dir = "baseline_output"
++
++        # sample inputs for checking that bad inputs get terminated / test_regex_inputs
++        self.bad_input1 = "-RP '\\b\\d+\\b'"  # label is missing
++        self.bad_input2 = "-RP 'NPO V' -RP THE -RPl testlabel"  # number of regexes and number of labels do not match
++        self.bad_input3 = "-CP '(Tamil|Li)' -RPl testlabel"  # -CP regex but -RPl label
++        self.bad_input4 = "-CPl testlabel"  # regex is missing
++        self.bad_input5 = "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"  # -CP regex is missing its label
++
++        self.bad_inputs_list = [self.bad_input1, self.bad_input2, self.bad_input3, self.bad_input4, self.bad_input5]
++
++        # sample inputs for checking the outcomes of good inputs / test_basic_regex
++        self.good_input1 = "-RP '\\b\\d{3}\\b' -RPl threedigits"
++        self.good_input2 = "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word"
++        self.good_input3 = "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning"
++        self.good_input4 = "-CP 'WP:EVADE' -CPl wp_evade"
++
++        self.good_inputs_list = [self.good_input1, self.good_input2, self.good_input3, self.good_input4]
++
++        # sample inputs with capture group(s) / test_capturegroup_regex
++        self.cap_input1 = "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letters>\\b[a-zA-Z]{3}\\b)|(?P<numbers>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three"
++        self.cap_input2 = "-CP '(?P<case_A>\\bTestCaseA\\b)|(?P<case_B>\\bTestCaseB\\b)|(?P<case_C>\\bTestCaseC\\b)|(?P<case_D>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<abbr>npov|NPOV)|(?P<full>neutral point of view)' -RPl npov"
++
++        self.cap_inputs_list = [self.cap_input1, self.cap_input2]
++
++    def test_regex_inputs(self):
++        for input in self.bad_inputs_list:
++            call = self.base_call.format(self.input_file)
++            call = call + " --stdout " + input
++            print(call)
++            proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
++            stdout, stderr = proc.communicate()
++            #print(proc.returncode)
++
++            # we want to check that the bad inputs were caught and that sys.exit is stopping the code
++            print(stderr.decode("utf-8"))
++            self.assertNotEqual(proc.returncode, 0)
++
++    def test_basic_regex(self):
++        i = 1
++        for input in self.good_inputs_list:
++
++            test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
++            #print(test_filename)
++            test_file = os.path.join(self.test_output_dir, test_filename)
++            if os.path.exists(test_file):
++                os.remove(test_file)
++
++            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
++            call = call + " " + input
++            print(call)
++
++            proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
++            proc.wait()
++
++            # copy the output aside, then truncate the shared output file between runs
++            copyfile(self.call_output, test_file)
++            f = open(self.call_output, 'w')
++            f.close()
++
++            # no baseline file to compare the test output against yet
++            # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
++            i += 1
++
++        # TODO: a proper assert statement still needs to go here; for now the
++        # generated files were inspected manually and the output looks correct
++
++
++    def test_capturegroup_regex(self):
++        i = 1
++        for input in self.cap_inputs_list:
++            test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
++            print(test_filename)
++            test_file = os.path.join(self.test_output_dir, test_filename)
++            if os.path.exists(test_file):
++                os.remove(test_file)
++
++            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
++            call = call + " " + input
++            print(call)
++
++            proc = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
++            proc.wait()
++
++            # copy the output aside, then truncate the shared output file between runs
++            copyfile(self.call_output, test_file)
++            f = open(self.call_output, 'w')
++            f.close()
++
++            # no baseline file to compare the test output against yet
++            # baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
++            i += 1
++
++        # TODO: a proper assert statement still needs to go here; for now the
++        # generated files were inspected manually and the output looks correct
++
  if __name__ == '__main__':
      unittest.main()
diff --combined wikiq
index 67785fe,b982eaa..7a1b846
--- a/wikiq
+++ b/wikiq
@@@ -34,6 -34,29 +34,56 @@@ def calculate_persistence(tokens_added)
      return(sum([(len(x.revisions)-1) for x in tokens_added]),
             len(tokens_added))
  
-def matchmaker(rev_data, regular_expression, scanner, rev): # rev_data, self.regex, self.scanner, rev
-    for location in scanner: # presumably 'comment' 'text' 'comment text' made into a list by args
-        if location == "comment":
-            matching_string = rev.comment
-        elif location == "text":
-            matching_string = rev.text
++def matchmake(scanned_content, rev_data, regex, label):
++    p = re.compile(regex)
++
++    temp_dict = {}
++    # if there are named capture groups in the regex
++    if bool(p.groupindex):
++        capture_groups = list(p.groupindex.keys())
++
++        # initialize the {capture_group_name: list} for each capture group
++        for cap_group in capture_groups:
++            temp_dict["{}_{}".format(label, cap_group)] = []
++
++        # if there are matches of some sort in this revision content, fill the lists for each cap_group
++        if p.search(scanned_content) is not None:
++            m = re.finditer(p, scanned_content)
++            matchobjects = list(m)
++
++            for cap_group in capture_groups:
++                temp_list = []
++                for match in matchobjects:
++                    # we only want to add the match for the capture group if the match is not None
++                    if match.group(cap_group) is not None:
++                        temp_list.append(match.group(cap_group))
++
++                # if the temp_list of matches is empty, just make that column None
++                if len(temp_list) == 0:
++                    temp_dict["{}_{}".format(label, cap_group)] = None
++                # else we put in the list we made in the for-loop above
++                else:
++                    temp_dict["{}_{}".format(label, cap_group)] = ', '.join(temp_list)
++
++        # there are no matches at all in this revision content; we default the values to None
+        else:
-            sys.exit("regex scanner location must be 'comment' or 'text'.")
-
-        if (re.search(regular_expression, matching_string) is not None): # we know that there is a match somewhere
-            m = re.finditer(regular_expression, matching_string) # all our matchObjects in a list
-            blob=""
-            for result in m:
-                blob = blob + "," + result.group(0)
-            # columns we want
-            rev_data['matches'] = blob #### the list of matchObjects, gleaned in post-processing
++            for cap_group in capture_groups:
++                temp_dict["{}_{}".format(label, cap_group)] = None
++
++    # there are no capture groups; we just search for all the matches of the regex
++    else:
++        # given that there are matches to be made
++        if p.search(scanned_content) is not None:
++            m = p.findall(scanned_content)
++            temp_dict[label] = ', '.join(m)
+        else:
-            rev_data['matches'] = None
-
++            temp_dict[label] = None
++
++    # update rev_data with our new columns
++    rev_data.update(temp_dict)
+    return rev_data
+
+
-
  class WikiqIterator():
      def __init__(self, fh, collapse_user=False):
          self.fh = fh
@@@ -127,8 -150,8 +177,7 @@@ class WikiqPage()
          return next(self.__revisions)
  
  class WikiqParser():
--
-     def __init__(self, input_file, output_file, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15):
-     def __init__(self, input_file, output_file, scanner, match_regex, collapse_user=False, persist=None, urlencode=False, namespaces=None):
++    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None, revert_radius=15):
          """
          Parameters:
             persist : what persistence method to use. Takes a PersistMethod value
          """
@@@ -141,7 -164,9 +190,12 @@@
          self.printed_header = False
          self.namespaces = []
          self.urlencode = urlencode
-         self.scanner = scanner
-         self.match_regex = match_regex
+         self.revert_radius = revert_radius
++        self.regex_match_revision = regex_match_revision
++        self.regex_revision_label = regex_revision_label
++        self.regex_match_comment = regex_match_comment
++        self.regex_comment_label = regex_comment_label
+
          if namespaces is not None:
              self.namespace_filter = set(namespaces)
          else:
@@@ -162,6 -187,7 +216,7 @@@
          # if we've made it this far with no matches, we return the default namespace
          return default_ns
  
+
      def process(self):
          # create a regex that creates the output filename
@@@ -188,9 -214,7 +243,9 @@@
              if namespace not in self.namespace_filter:
                  continue
  
-             rev_detector = mwreverts.Detector()
+             rev_detector = mwreverts.Detector(radius=self.revert_radius)
+
              if self.persist != PersistMethod.none:
                  window = deque(maxlen=PERSISTENCE_RADIUS)
@@@ -210,14 -234,30 +265,52 @@@
              # Iterate through a page's revisions
              for rev in page:
-                 ## m = re.finditer() #so we can find all instances
-                 ## m.groupdict() #so we can look at them all with their names
-
-                 # initialize rev_dat
++
++                # initialize rev_data
+                 rev_data = {}
-                 rev_data = {'revid' : rev.id,
-                             'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
-                             'articleid' : page.id,
-                             'editor_id' : "" if rev.deleted.user == True or rev.user.id is None else rev.user.id,
-                             'title' : '"' + page.title + '"',
-                             'namespace' : namespace,
-                             'deleted' : "TRUE" if rev.deleted.text else "FALSE" }
-                 if self.scanner is not None: # we know we want to do a regex search
-                     ## comment = want to look in comment attached to revision
-                     ## text = want to look in revision text
++                # if the command line args only gave a label (and no regular expression is given)
++                if (self.regex_revision_label is not None and self.regex_match_revision is None) or (self.regex_comment_label is not None and self.regex_match_comment is None):
++                    sys.exit('The given regex label(s) have no corresponding regex to search for.')
++
++                # if there's anything in the list of regex_match_revision
++                if self.regex_match_revision is not None:
++                    if (self.regex_revision_label is None) or (len(self.regex_match_revision) != len(self.regex_revision_label)):
++                        sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
++
++                    # initialize and construct the list of regex-label tuples
++                    pairs = []
++                    for i in range(len(self.regex_match_revision)):
++                        pairs.append((self.regex_match_revision[i], self.regex_revision_label[i]))
++
++                    # for each regex/label pair, we now run matchmake to check and output columns
++                    for pair in pairs:
++                        # pair[0] corresponds to the regex, pair[1] to the label
++                        rev_data = matchmake(rev.text, rev_data, pair[0], pair[1])
++
++                # if there's anything in the list of regex_match_comment
++                if self.regex_match_comment is not None:
++                    if (self.regex_comment_label is None) or (len(self.regex_match_comment) != len(self.regex_comment_label)):
++                        sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
++
++                    # initialize and construct the list of regex-label tuples
++                    pairs = []
++                    for i in range(len(self.regex_match_comment)):
++                        pairs.append((self.regex_match_comment[i], self.regex_comment_label[i]))
+
-                     ### call the scanner function
-                     rev_data = matchmaker(rev_data, self.match_regex, self.scanner, rev)
-
-                     if self.scanner is not None and rev_data['matches'] is None:
-                         next
++                    # for each regex/label pair, we now run matchmake to check and output columns
++                    for pair in pairs:
++                        # pair[0] corresponds to the regex, pair[1] to the label
++                        rev_data = matchmake(rev.comment, rev_data, pair[0], pair[1])
+
+                 # we fill out the rest of the data structure now
+                 rev_data['revid'] = rev.id
+                 rev_data['date_time'] = rev.timestamp.strftime('%Y-%m-%d %H:%M:%S')
+                 rev_data['articleid'] = page.id
+                 rev_data['editor_id'] = "" if rev.deleted.user or rev.user.id is None else rev.user.id
+                 rev_data['title'] = '"' + page.title + '"'
+                 rev_data['namespace'] = namespace
+                 rev_data['deleted'] = "TRUE" if rev.deleted.text else "FALSE"
  
                  # if revisions are deleted, /many/ things will be missing
                  if rev.deleted.text:
@@@ -242,7 -282,7 +335,7 @@@
                  # TODO rev.bytes doesn't work.. looks like a bug
                  rev_data['text_chars'] = len(rev.text)
  
                  # generate revert data
                  revert = rev_detector.process(text_sha1, rev.id)
@@@ -390,15 -430,11 +483,25 @@@ parser.add_argument('-u', '--url-encode
  parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
                      help="Id number of namespace to include. Can be specified more than once.")
  
-parser.add_argument('-rs', '--regex-scanner', dest="scanner", type=str, action='append',
-                    help="Find the regex match specified by -R/--match searching in: (1) comment (2) text.")
+parser.add_argument('-rr',
+                    '--revert-radius',
+                    dest="revert_radius",
+                    type=int,
+                    action='store',
+                    default=15,
+                    help="Number of edits to check when looking for reverts (default: 15)")
+
++parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
++                    help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")
++
++parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
++                    help="The label for the output column produced by matching the regex in revision text.")
+
++parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
++                    help="The regular expression to search for in the comments of revisions.")
+
-parser.add_argument('-R', '--match', dest="match_regex", type=str,
-                    help="The regular expression you would like to find in the string and put in capture group")
++parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
++                    help="The label for the output column produced by matching the regex in comments.")
  
  args = parser.parse_args()
  
@@@ -436,13 -472,13 +539,17 @@@ if len(args.dumpfiles) > 0
              filename = os.path.join(output_dir, os.path.basename(filename))
              output_file = open_output_file(filename)
  
-             wikiq = WikiqParser(input_file, output_file,
+             wikiq = WikiqParser(input_file,
+                                 output_file,
                                  collapse_user=args.collapse_user,
                                  persist=persist,
                                  urlencode=args.urlencode,
-                                 namespaces = namespaces,
-                                 match_regex=args.match_regex, # adding in the new 2 args for regex searching
-                                 scanner=args.scanner)
+                                 namespaces=namespaces,
-                                 revert_radius=args.revert_radius)
++                                revert_radius=args.revert_radius,
++                                regex_match_revision=args.regex_match_revision,
++                                regex_revision_label=args.regex_revision_label,
++                                regex_match_comment=args.regex_match_comment,
++                                regex_comment_label=args.regex_comment_label)
  
              wikiq.process()
  
@@@ -450,15 -486,15 +557,20 @@@
              input_file.close()
              output_file.close()
  else:
-     wikiq = WikiqParser(sys.stdin, sys.stdout,
+     wikiq = WikiqParser(sys.stdin,
+                         sys.stdout,
                          collapse_user=args.collapse_user,
                          persist=persist,
-                         persist_legacy=args.persist_legacy,
+                         #persist_legacy=args.persist_legacy,
                          urlencode=args.urlencode,
-                         namespaces = namespaces,
-                         match_regex=args.match_regex, # adding in the new 2 args for regex searching
-                         scanner=args.scanner)
-     wikiq.process()
+                         namespaces=namespaces,
-                         revert_radius=args.revert_radius)
-     wikiq.process()
++                        revert_radius=args.revert_radius,
++                        regex_match_revision=args.regex_match_revision,
++                        regex_revision_label=args.regex_revision_label,
++                        regex_match_comment=args.regex_match_comment,
++                        regex_comment_label=args.regex_comment_label)
++
++    wikiq.process()
  
# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
# stop_words = stop_words.split(",")
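
For orientation, here is a minimal, self-contained sketch of the capture-group logic that the new matchmake() helper implements. This sketch is not part of the commit; the function name, sample text, label, and group names below are all hypothetical:

    import re

    def demo_matchmake(scanned_content, label, regex):
        # Named capture groups become "<label>_<group>" columns whose values
        # are comma-joined match strings; a group that never matches maps to None.
        # Without named groups, all matches go into a single column named <label>.
        p = re.compile(regex)
        out = {}
        if p.groupindex:
            for group in p.groupindex:
                hits = [m.group(group) for m in p.finditer(scanned_content)
                        if m.group(group) is not None]
                out["{}_{}".format(label, group)] = ', '.join(hits) if hits else None
        else:
            hits = p.findall(scanned_content)
            out[label] = ', '.join(hits) if hits else None
        return out

    # Hypothetical revision text and label:
    print(demo_matchmake("the cat sat on 123 mats", "demo",
                         r"(?P<word>\bcat\b)|(?P<num>\b\d+\b)"))
    # -> {'demo_word': 'cat', 'demo_num': '123'}

In wikiq itself the label comes from -RPl/-CPl and the regex from -RP/-CP, e.g. ../wikiq dump.xml.bz2 --stdout -RP '\bcat\b' -RPl cat_mentions, mirroring the invocations built in the unit tests above.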
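Similarly, a small sketch of what the new -rr/--revert-radius option controls, using only the mwreverts calls that already appear in the diff (mwreverts.Detector(radius=...) and detector.process(checksum, rev_id)); the checksums and revision ids here are made up:

    import mwreverts

    # A revert is detected when a revision's checksum repeats an earlier one
    # within the detector's radius, i.e. within the window of recent edits it
    # looks back through.
    detector = mwreverts.Detector(radius=2)
    history = [("sha_a", 1), ("sha_b", 2), ("sha_a", 3)]  # rev 3 restores rev 1's text
    for checksum, rev_id in history:
        revert = detector.process(checksum, rev_id)
        if revert is not None:
            print("revision", rev_id, "is a revert")  # expected for revision 3

A larger radius catches reverts that undo more intermediate revisions at the cost of more work per revision; the default in the diff is 15, and test_WP_revert_radius exercises -rr 1.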