scraper_utils.py

   1 # Functions common to both scrapers
   2 # Copyright (C) 2018  Nathan TeBlunthuis
   3
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
  16
  17 # returns an iterator of wiki,url tuples
  18 import pandas as pd
  19 from os import makedirs, path
  20 from shutil import rmtree
  21 from itertools import islice
  22
  23
  24 def _add_wikitype(tpl):
  25     print(tpl)
  26     wiki, url = tpl[0:2]
  27     wikitype = "NA"
  28
  29     if "wikipedia.org" in url:
  30         wikitype = "wikipedia"
  31         url = url + '/w/'
  32
  33     elif "wikia.com" in url:
  34         wikitype = 'wikia'
  35
  36     print(url)
  37     print(wiki)
  38     url = url.strip()
  39     wiki = wiki.strip()
  40     tpl = (wiki, url, wikitype)
  41     return tpl
  42
  43
  44 def read_wikilist(args):
  45     if args.sep in ['\\t', '\t', 'tab', 't']:
  46         sep = '\t'
  47     else:
  48         sep = args.sep
  49
  50     if not args.no_header:
  51         wikilist = pd.read_table(args.wikilist, sep=sep)
  52         wikilist = ((t.dbname, t.url)
  53                     for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
  54
  55     else:
  56         j, k = [int(i) for i in args.i.split(',')[0:2]]
  57         print(args.i)
  58         wikilist = open(args.wikilist)
  59         wikilist = (line.split(sep) for line in wikilist)
  60         wikilist = ((fields[j], fields[k]) for fields in wikilist)
  61         wikilist = islice(wikilist, 1, None)
  62
  63     wikilist = (_add_wikitype(t) for t in wikilist)
  64     return wikilist
  65
  66
  67 def add_parser_arguments(parser):
  68     parser.add_argument('--no-header', action='store_true',
  69                         help='does the wikilist have no header?')
  70
  71     parser.add_argument('--nuke-old', action='store_true',
  72                         help='remove old files.')
  73
  74     parser.add_argument('--sep', type=str,
  75                         help='input table delimiter', default=',')
  76
  77     parser.add_argument(
  78         'wikilist',
  79         type=str,
  80         help='path to the input file: a wiki list with wiki\turl\filename')
  81
  82     parser.add_argument(
  83         'output',
  84         type=str,
  85         help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')
  86
  87     parser.add_argument('-i',
  88                         type=str,
  89                         help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
  90                         default='0,1')
  91
  92     return(parser)
  93
  94
  95 def prepare_output(output_path, nuke_old):
  96     if not path.exists(output_path):
  97         makedirs(output_path)
  98     if nuke_old:
  99         rmtree(output_path)
 100     if not path.exists(output_path):
 101         makedirs(output_path)