# Source: https://code.communitydata.science/articlequality_ordinal.git
#         (ores_scores_sample.py, blob b881e4c)
"""Fetch revision texts from English Wikipedia and score them.

The script reads a feather file containing a ``revid`` column, downloads the
text of each revision through the MediaWiki API, runs every revision through
``scoring_utils.add_score`` with the article quality model loaded from
``MODEL_PATH``, and writes the sample merged with the scores as ``.json``
(one ``str(dict)`` per line), ``.feather`` and ``.csv`` files.
"""
from functools import partial
from itertools import chain, zip_longest
from multiprocessing import Pool
from pathlib import Path

import articlequality
import fire
import mwapi
import pandas as pd
import pyarrow
import tqdm
from pyRemembeR import Remember
from revscoring import Model

import scoring_utils

remember = Remember("score_sample_articles.RDS")

# Location of the serialized scoring model, relative to the working directory.
MODEL_PATH = 'articlequality/models/enwiki.nettrom_wp10.gradient_boosting.model'


def get_revision_text(revid_batch, api):
    """Yield ``{'revid': ..., 'text': ...}`` for every revision in a batch.

    Args:
        revid_batch: iterable of revision ids; ``None`` entries (the padding
            produced by ``grouper``) are ignored.
        api: an ``mwapi.Session`` (or any object with a compatible ``get``).

    Yields:
        One dict per revision returned by the API, holding the revision id
        and the content of its ``main`` slot.
    """
    revids = [rid for rid in revid_batch if rid is not None]
    if not revids:
        # Nothing but padding: do not issue an empty query.
        return

    response = api.get(action='query',
                       prop='revisions',
                       revids=revids,
                       rvprop=['ids', 'content'],
                       rvslots=['main'])
    pages = response.get('query', {}).get('pages', {})
    for page in pages.values():
        for revision in page.get('revisions', []):
            text = revision.get('slots', {}).get('main', {}).get('*', {})
            yield {'revid': revision.get('revid', {}), 'text': text}


def grouper(n, iterable, fillvalue=None):
    """Collect *iterable* into tuples of length *n*, padding the last one.

    grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    """
    # n references to the *same* iterator, so zip_longest consumes n items
    # per output tuple.
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)


def pull_revision_texts(revids, api, api_batch_size):
    """Yield revision dicts for *revids*, querying *api_batch_size* ids per call."""
    batches = grouper(api_batch_size, revids)
    fetch_batch = partial(get_revision_text, api=api)
    # from_iterable keeps the batching lazy: a batch is only built (and
    # requested) once the previous one has been consumed.
    yield from chain.from_iterable(map(fetch_batch, batches))


def score_revisions(revids, api, api_batch_size=50, parallel=True, ncores=48):
    """Yield one scored record per revision in *revids*.

    Args:
        revids: iterable of revision ids to fetch and score.
        api: an ``mwapi.Session`` used to download the revision texts.
        api_batch_size: number of revision ids per API request; the pool
            chunk size is four times this value.
        parallel: when true, score in a ``multiprocessing.Pool``; results
            then arrive in arbitrary order.
        ncores: number of worker processes used when *parallel* is true.

    Yields:
        The result of ``scoring_utils.to_pddict(..., kept_keys=['revid'])``
        for every scored revision.
    """
    revs = pull_revision_texts(revids, api, api_batch_size)

    with open(MODEL_PATH, 'rb') as model_file:
        scorer_model = Model.load(model_file)

    add_score = partial(scoring_utils.add_score, scorer_model=scorer_model)
    to_pddict = partial(scoring_utils.to_pddict, kept_keys=['revid'])

    if parallel:
        # A single pool, released as soon as the generator is exhausted or
        # closed, instead of two pools that were never shut down.
        with Pool(ncores) as pool:
            scored = pool.imap_unordered(add_score, revs,
                                         chunksize=api_batch_size * 4)
            yield from map(to_pddict, scored)
    else:
        yield from map(to_pddict, map(add_score, revs))


#sample_file_parquet = "data/article_sample_set.parquet"; output_feather="data/scored_article_sample.feather";

# NOTE(review): these module-level names are not read by score_sample (its
# parameters shadow them); they look like leftovers from interactive use.
sample_file="/data/nti9383home/production_functions/data/20200301_article_labelings_sample.feather";output="/data/nti9383home/production_functions/data/scored_article_sample.feather"


def _replace_suffixes(path, suffix):
    """Return *path* with all of its trailing suffixes replaced by *suffix*.

    Only the end of the file name is touched, so a path without any suffix
    simply gets *suffix* appended and directory names are left alone.
    """
    old = "".join(path.suffixes)
    stem = path.name[:-len(old)] if old else path.name
    return path.with_name(stem + suffix)


def score_sample(sample_file = "data/article_sample_set.feather", output="data/scored_article_sample.feather"):
    """Score every revision of a sample and save the merged result.

    Args:
        sample_file: feather file with (at least) a ``revid`` column.
        output: base output path; its suffixes are replaced to derive the
            ``.json``, ``.feather`` and ``.csv`` file names.
    """
    sample = pd.read_feather(sample_file)

    revids = set(sample.revid)
    user_agent = "Nate TeBlunthuis . What's the relationship between contributors and article quality?"
    api = mwapi.Session("https://en.wikipedia.org", user_agent=user_agent)

    scores = tqdm.tqdm(score_revisions(revids, api, 50, True),
                       total=len(revids), miniters=100, smoothing=0.2)

    p = Path(output)
    output_csv = _replace_suffixes(p, ".csv")
    output_json = _replace_suffixes(p, ".json")
    output_feather = _replace_suffixes(p, ".feather")

    # Stream each score to disk as it arrives so partial results survive a
    # crash, and keep them in memory for the merge below.
    saved_scores = list()
    with open(output_json, 'w') as of:
        for score in scores:
            of.write(str(score) + '\n')
            saved_scores.append(score)

    scored_revids = pd.DataFrame(saved_scores)
    sample_1 = sample.merge(scored_revids, left_on="revid", right_on="revid")
    # NOTE(review): both values below are the size of the *merged* frame;
    # "sample_size_unscored" was presumably meant to be something else
    # (e.g. sample.shape[0]) -- confirm before relying on it.
    remember(sample_1.shape[0], "sample_size_unscored")

    remember(sample_1.shape[0], "sample_size_scored")
    sample_1.to_feather(output_feather)
    sample_1.to_csv(output_csv)


if __name__ == "__main__":
    fire.Fire(score_sample)