1 # Functions common to both scrapers
2 # Copyright (C) 2018 Nathan TeBlunthuis
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 # returns an iterator of wiki,url tuples
19 from os import makedirs, path
20 from shutil import rmtree
21 from itertools import islice
def _add_wikitype(tpl):
    # Tag a (wiki, url) tuple with the kind of wiki the URL points at.
    # NOTE(review): this block appears truncated in this chunk — `wiki`,
    # `url`, and `wikitype` are referenced but never assigned here, the
    # "wikia" branch has no visible body, and there is no return statement.
    # Presumably the missing lines unpacked (wiki, url) from `tpl`, set
    # wikitype = "wikia" in the elif branch, and returned the extended
    # tuple — confirm against the full file before relying on this.
    if "wikipedia.org" in url:
        wikitype = "wikipedia"
    elif "wikia.com" in url:
        tpl = (wiki, url, wikitype)
def read_wikilist(args):
    # Build an iterator of (wiki, url) tuples from the wiki-list file named
    # by args.wikilist (see the "returns an iterator of wiki,url tuples"
    # comment at the top of the file).
    # NOTE(review): this block appears truncated in this chunk — `sep` is
    # tested but never assigned, the with-header and no-header paths run
    # together without a visible if/else split, and there is no return
    # statement. Presumably the missing lines set `sep` from args.sep and
    # returned `wikilist` — confirm against the full file.
    if args.sep in ['\\t', '\t', 'tab', 't']:
    if not args.no_header:
        # Tabular input with a header row: read with pandas and take the
        # dbname/url columns. NOTE(review): `pd` is not imported in the
        # visible portion of the file — presumably imported above.
        wikilist = pd.read_table(args.wikilist, sep=sep)
        wikilist = ((t.dbname, t.url)
                    for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
        # Header-less path: args.i supplies the 0-based column indices of
        # the wiki and url fields (see the -i option in
        # add_parser_arguments).
        j, k = [int(i) for i in args.i.split(',')[0:2]]
        wikilist = open(args.wikilist)
        wikilist = (line.split(sep) for line in wikilist)
        wikilist = ((fields[j], fields[k]) for fields in wikilist)
        # Skip the first line of the file.
        wikilist = islice(wikilist, 1, None)
    # Tag every tuple with its wiki type (wikipedia / wikia).
    wikilist = (_add_wikitype(t) for t in wikilist)
def add_parser_arguments(parser):
    # Register the command-line options shared by both scrapers on
    # `parser` (an argparse.ArgumentParser); the flags defined here are
    # consumed by read_wikilist (--sep, --no-header, -i, the wikilist
    # path) and prepare_output (--nuke-old, the output path).
    # NOTE(review): this block appears truncated in this chunk — the
    # `parser.add_argument(...)` openers that own the two orphaned `help=`
    # lines below (the input wikilist path and the output path) and the
    # trailing arguments/close of the -i option are missing. Confirm the
    # option names and defaults against the full file.
    parser.add_argument('--no-header', action='store_true',
                        help='does the wikilist have no header?')
    parser.add_argument('--nuke-old', action='store_true',
                        help='remove old files.')
    parser.add_argument('--sep', type=str,
                        help='input table delimiter', default=',')
                        help='path to the input file: a wiki list with wiki\turl\filename')
                        help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')
    parser.add_argument('-i',
                        help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
def prepare_output(output_path, nuke_old):
    """Ensure ``output_path`` exists as a directory.

    If ``nuke_old`` is true, delete any existing directory at that path
    first so stale scrape results are removed before new ones are written
    (this implements the ``--nuke-old`` flag registered in
    ``add_parser_arguments``).

    Parameters
    ----------
    output_path : str
        Directory the scraper should write into.
    nuke_old : bool
        When true, remove the existing directory and recreate it empty.
    """
    # As shown in this chunk the first `if not path.exists(...):` had no
    # body (a syntax error) and `nuke_old` was never consulted despite the
    # `rmtree` import and the --nuke-old flag; this restores the intended
    # remove-then-recreate behavior.
    if nuke_old and path.exists(output_path):
        rmtree(output_path)
    if not path.exists(output_path):
        makedirs(output_path)