# Functions common to both scrapers
# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import pandas as pd
from os import makedirs, path
from shutil import rmtree
from itertools import islice


def _add_wikitype(tpl):
    # Classify a (wiki, url) tuple as Wikipedia or Wikia and normalize the url.
    print(tpl)
    wiki, url = tpl[0:2]
    wikitype = "NA"

    if "wikipedia.org" in url:
        wikitype = "wikipedia"
        url = url + '/w/'

    elif "wikia.com" in url:
        wikitype = 'wikia'

    print(url)
    print(wiki)

    url = url.strip()
    wiki = wiki.strip()
    tpl = (wiki, url, wikitype)
    return tpl


# returns an iterator of (wiki, url, wikitype) tuples
def read_wikilist(args):
    if args.sep in ['\\t', '\t', 'tab', 't']:
        sep = '\t'
    else:
        sep = args.sep

    if not args.no_header:
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())

    else:
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        print(args.i)
        wikilist = open(args.wikilist)
        wikilist = (line.split(sep) for line in wikilist)
        wikilist = ((fields[j], fields[k]) for fields in wikilist)
        wikilist = islice(wikilist, 1, None)

    wikilist = (_add_wikitype(t) for t in wikilist)
    return wikilist


def add_parser_arguments(parser):
    parser.add_argument('--no-header', action='store_true',
                        help='does the wikilist have no header?')

    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files.')

    parser.add_argument('--sep', type=str,
                        help='input table delimiter', default=',')

    parser.add_argument(
        'wikilist',
        type=str,
        help='path to the input file: a wiki list with wiki, url, and filename fields')

    parser.add_argument(
        'output',
        type=str,
        help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')

    parser.add_argument('-i',
                        type=str,
                        help='two 0-based indices for wiki and url in the csv, default=0,1',
                        default='0,1')

    return parser


def prepare_output(output_path, nuke_old):
    # Create the output directory, wiping any existing contents if requested.
    if not path.exists(output_path):
        makedirs(output_path)
    if nuke_old:
        rmtree(output_path)
    if not path.exists(output_path):
        makedirs(output_path)
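

# --- Usage sketch (illustrative only, not part of the original scrapers) ---
# A hypothetical driver script might wire these helpers together as below;
# the real scraper scripts that import this module supply their own argument
# parsing and scraping logic on top of the wikilist iterator.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='example driver for the wikilist helpers')
    parser = add_parser_arguments(parser)
    args = parser.parse_args()

    # Create (and optionally wipe) the output directory before scraping.
    prepare_output(args.output, args.nuke_old)

    # Iterate over the wikis listed in the input file.
    for wiki, url, wikitype in read_wikilist(args):
        print(wiki, url, wikitype)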