scraper_utils.py

   1 # Functions common to both scrapers
   2 # Copyright (C) 2018  Nathan TeBlunthuis
   3
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8
   9 # returns an iterator of wiki,url tuples
  10 import pandas as pd
  11 from os import makedirs, path
  12 from shutil import rmtree
  13 from itertools import islice
  14
  15
  16 def _add_wikitype(tpl):
  17     print(tpl)
  18     wiki, url = tpl[0:2]
  19     wikitype = "NA"
  20
  21     if "wikipedia.org" in url:
  22         wikitype = "wikipedia"
  23         url = url + '/w/'
  24
  25     elif "wikia.com" in url:
  26         wikitype = 'wikia'
  27
  28     print(url)
  29     print(wiki)
  30     url = url.strip()
  31     wiki = wiki.strip()
  32     tpl = (wiki, url, wikitype)
  33     return tpl
  34
  35
  36 def read_wikilist(args):
  37     if args.sep in ['\\t', '\t', 'tab', 't']:
  38         sep = '\t'
  39     else:
  40         sep = args.sep
  41
  42     if not args.no_header:
  43         wikilist = pd.read_table(args.wikilist, sep=sep)
  44         wikilist = ((t.dbname, t.url)
  45                     for t in wikilist.loc[:, ['dbname', 'url']].itertuples())
  46
  47     else:
  48         j, k = [int(i) for i in args.i.split(',')[0:2]]
  49         print(args.i)
  50         wikilist = open(args.wikilist)
  51         wikilist = (line.split(sep) for line in wikilist)
  52         wikilist = ((fields[j], fields[k]) for fields in wikilist)
  53         wikilist = islice(wikilist, 1, None)
  54
  55     wikilist = (_add_wikitype(t) for t in wikilist)
  56     return wikilist
  57
  58
  59 def add_parser_arguments(parser):
  60     parser.add_argument('--no-header', action='store_true',
  61                         help='does the wikilist have no header?')
  62
  63     parser.add_argument('--nuke-old', action='store_true',
  64                         help='remove old files.')
  65
  66     parser.add_argument('--sep', type=str,
  67                         help='input table delimiter', default=',')
  68
  69     parser.add_argument(
  70         'wikilist',
  71         type=str,
  72         help='path to the input file: a wiki list with wiki\turl\filename')
  73
  74     parser.add_argument(
  75         'output',
  76         type=str,
  77         help='path to put the logs we scrape e.g. /com/projects/messagewalls/allusers/')
  78
  79     parser.add_argument('-i',
  80                         type=str,
  81                         help='<j,k> two 0-based indices for wiki and url in the csv, default=0,1',
  82                         default='0,1')
  83
  84     return(parser)
  85
  86
  87 def prepare_output(output_path, nuke_old):
  88     if not path.exists(output_path):
  89         makedirs(output_path)
  90     if nuke_old:
  91         rmtree(output_path)
  92     if not path.exists(output_path):
  93         makedirs(output_path)