The timestamps in files should be the day the exports are run. For
the view data, the query date needs to be the day before, but that
shouldn't be the timestamp we use in file names, etc.
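A minimal sketch of the distinction this change draws, using the variable names introduced in the diff below (the surrounding argument handling is assumed): the view data is queried for the previous day, while output files and logs are stamped with the day the export actually runs.

import datetime

# Day the view data is queried for: per the note above, this needs to be
# the day before the run (unless -d overrides it).
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
query_date = yesterday.strftime("%Y%m%d")

# Day the export is run: this is the date stamped into output file names.
export_date = datetime.datetime.today().strftime("%Y%m%d")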
#import feather #TBD
def parse_args():
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia view data.')
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
#handle -d
if args.query_date:
- queryDate = args.query_date
+ query_date = args.query_date
else:
yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
- queryDate = yesterday.strftime("%Y%m%d")
+ query_date = yesterday.strftime("%Y%m%d")
#handle -L
loglevel_mapping = { 'debug' : logging.DEBUG,
export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
export_time = str(datetime.datetime.now())
+ export_date = datetime.datetime.today().strftime("%Y%m%d")
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {export_git_hash}")
#1 Load up the list of article names
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {export_git_hash}")
#1 Load up the list of article names
- j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.json")
- t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{queryDate}.tsv")
+ j_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.json")
+ t_outfilename = os.path.join(outputPath, f"digobs_covid19-wikipedia-enwiki_dailyviews-{export_date}.tsv")
with open(articleFile, 'r') as infile:
articleList = list(infile)
#2 Repeatedly call the API with that list of names
for a in articleList:
a = a.strip("\"\n") #destringify
- url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{queryDate}00/{queryDate}00"
+ url= f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{a}/daily/{query_date}00/{query_date}00"
response = requests.get(url)
if response.ok:
# write out of the csv file
dw.writerow(jd)
- # f_Out = outputPath + "dailyviews" + queryDate + ".feather"
+ # f_Out = outputPath + "dailyviews" + query_date + ".feather"
# read the json back in and make a feather file?
logging.debug(f"Run complete at {datetime.datetime.now()}")
logging.info(f"Processed {success} successful URLs and {failure} failures.")
###############################################################################
#
parser = argparse.ArgumentParser(description='Call the views API to collect Wikipedia revision data.')
parser.add_argument('-o', '--output_folder', help='Where to save output', default="wikipedia/data", type=str)
parser.add_argument('-i', '--article_file', help='File listing article names', default="wikipedia/resources/enwp_wikiproject_covid19_articles.txt", type=str)
- parser.add_argument('-d', '--query_date', help='Date if not yesterday, in YYYYMMDD format.', type=str)
parser.add_argument('-L', '--logging_level', help='Logging level. Options are debug, info, warning, error, critical. Default: info.', default='info', type=str),
parser.add_argument('-W', '--logging_destination', help='Logging destination file. (default: standard error)', type=str),
args = parser.parse_args()
output_path = args.output_folder
article_filename = args.article_file
- #handle -d
- if args.query_date:
- query_date = args.query_date
- else:
- yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
- query_date = yesterday.strftime("%Y%m%d")
#handle -L
loglevel_mapping = { 'debug' : logging.DEBUG,
export_git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
export_git_short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
export_time = str(datetime.datetime.now())
+ export_date = datetime.datetime.today().strftime("%Y%m%d")
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {export_git_hash}")
logging.info(f"Starting run at {export_time}")
logging.info(f"Last commit: {export_git_hash}")
- json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.json")
- tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{query_date}.tsv")
+ json_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
+ tsv_output_filename = os.path.join(output_path, f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")
api_session = api.Session("https://en.wikipedia.org/w/api.php")
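With the -d / --query_date option removed from the revision export, the revision file names depend only on the export day. For example, a run on 2020-03-28 would, under the naming scheme above, produce files like:

digobs_covid19-wikipedia-enwiki_revisions-20200328.json
digobs_covid19-wikipedia-enwiki_revisions-20200328.tsv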