# Functions common to both scrapers
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from itertools import islice
from os import makedirs, path
from shutil import rmtree

import pandas as pd


def _add_wikitype(tpl):
    """Return a (wiki, url, wikitype) tuple, tagging the url as wikipedia or wikia."""
    wiki, url = tpl[0:2]
    wiki = wiki.strip()
    url = url.strip()
    wikitype = "NA"
    if "wikipedia.org" in url:
        wikitype = "wikipedia"
        url = url + '/w/'
    elif "wikia.com" in url:
        wikitype = "wikia"
    return (wiki, url, wikitype)
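
# For illustration (values are hypothetical): _add_wikitype(("enwiki", "https://en.wikipedia.org"))
# returns ("enwiki", "https://en.wikipedia.org/w/", "wikipedia"); a wikia.com url is tagged
# "wikia", and anything else keeps the default "NA".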


def read_wikilist(args):
    """Return an iterator of (wiki, url, wikitype) tuples read from the wikilist file."""
    if args.sep in ['\\t', '\t', 'tab', 't']:
        sep = '\t'
    else:
        sep = args.sep

    if not args.no_header:
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
    else:
        # with no header row, pick the wiki and url columns by the 0-based indices in -i
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        wikilist = open(args.wikilist)
        wikilist = (line.split(sep) for line in wikilist)
        wikilist = ((fields[j], fields[k]) for fields in wikilist)
        wikilist = islice(wikilist, 1, None)

    return (_add_wikitype(t) for t in wikilist)
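
# The wikilist file is expected to be a delimited table; when it has a header it must
# include 'dbname' and 'url' columns. An illustrative example (not from the original
# source):
#
#   dbname,url
#   enwiki,https://en.wikipedia.org
#   muppet,https://muppet.wikia.com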


def add_parser_arguments(parser):
    """Attach the command line arguments shared by the scrapers to an argparse parser."""
    parser.add_argument('--no-header', action='store_true',
                        help='the wikilist has no header row')
    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files')
    parser.add_argument('--sep', type=str, default=',',
                        help='input table delimiter')
    parser.add_argument('wikilist', type=str,
                        help='path to the input file: a wiki list with wiki, url, and filename columns')
    parser.add_argument('output', type=str,
                        help='path to put the logs we scrape, e.g. /com/projects/messagewalls/allusers/')
    parser.add_argument('-i', type=str, default='0,1',
                        help='two 0-based indices for wiki and url in the csv, default=0,1')
    return parser


def prepare_output(output_path, nuke_old):
    """Create the output directory, optionally clearing out any old files first."""
    if nuke_old and path.exists(output_path):
        rmtree(output_path)
    if not path.exists(output_path):
        makedirs(output_path)
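

# A minimal usage sketch (not part of the original module), assuming the standard
# argparse wiring: build the shared CLI, prepare the output directory, and iterate
# over the wikilist.
if __name__ == "__main__":
    import argparse

    args = add_parser_arguments(argparse.ArgumentParser()).parse_args()
    prepare_output(args.output, args.nuke_old)
    for wiki, url, wikitype in read_wikilist(args):
        print(wiki, url, wikitype)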