From: Nathan TeBlunthuis
Date: Sat, 28 Mar 2020 20:55:52 +0000 (-0700)
Subject: Read entire input files before making api calls.
X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/207b1f8b95096f2dcde913f7556003ad59e41123?ds=inline

Read entire input files before making api calls.

This is nicer style to not hold onto resources for as long. It will use a bit more memory.
---

diff --git a/transliterations/src/collect_trends.py b/transliterations/src/collect_trends.py
new file mode 100644
index 0000000..16a5c93
--- /dev/null
+++ b/transliterations/src/collect_trends.py
@@ -0,0 +1,76 @@
+# this follows a similar approach to nick's trends.js but in python
+from pytrends.request import TrendReq
+from datetime import datetime
+from os import path
+import csv
+from itertools import islice, chain, zip_longest
+import pandas as pd
+
+
+# from itertools recipes
+#https://docs.python.org/3.6/library/itertools.html#itertools-recipes
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
+    args = [iter(iterable)] * n
+    return zip_longest(*args, fillvalue=fillvalue)
+
+def get_daily_trends():
+    trendReq = TrendReq(backoff_factor=0.2)
+    today_trending = trendReq.today_searches()
+    daily_trends_outfile = path.join("..","data","output","daily_google_trends.csv")
+
+    write_header = False
+    header = ['date','term','top']
+
+    if not path.exists(daily_trends_outfile):
+        write_header = True
+
+    with open("../data/output/daily_google_trends.csv",'a',newline='') as of:
+        writer = csv.writer(of)
+        if write_header:
+            writer.writerow(header)
+
+        for i, trend in enumerate(today_trending):
+            writer.writerow([str(datetime.now().date()),trend,i])
+
+def get_related_queries(stems):
+    # we have to batch these in sets of 5
+    trendReq = TrendReq(backoff_factor=0.2)
+    def _get_related_queries(chunk):
+        kw_list = list(filter(lambda x: x is not None, chunk))
+        trendReq.build_payload(kw_list=kw_list)
+        related_queries = trendReq.related_queries()
+        for term, results in related_queries.items():
+            for key, df in results.items():
+                if df is not None:
+                    df["term"] = term
+                    yield (key,df)
+
+    l = chain(*map(_get_related_queries, grouper(stems,5)))
+    out = {}
+    for key, value in l:
+        if key in out:
+            out[key].append(value)
+        else:
+            out[key] = [value]
+
+    for k in out.keys():
+        df = pd.concat(out[k])
+        df['date'] = str(datetime.now().date())
+        out[k] = df
+        outfile = path.join('..','data','output',f"related_searches_{k}.csv")
+        if path.exists(outfile):
+            mode = 'a'
+            header = False
+        else:
+            mode = 'w'
+            header = True
+
+        df.to_csv(outfile, mode=mode, header=header,index=False)
+
+stems = [t.strip() for t in open("../data/input/base_terms.txt",'r')]
+
+get_daily_trends()
+
+get_related_queries(stems)
diff --git a/transliterations/src/wikidata_search.py b/transliterations/src/wikidata_search.py
index 21e8598..a3abbc0 100644
--- a/transliterations/src/wikidata_search.py
+++ b/transliterations/src/wikidata_search.py
@@ -68,12 +68,12 @@ def read_google_trends_files(terms_files):
 def trawl_google_trends(terms_files, outfile = None, mode='w'):
-    terms = read_google_trends_files(terms_files)
+    terms = list(read_google_trends_files(terms_files))
     resultset = run_wikidata_searches(terms)
     resultset.to_csv(outfile, mode)
 def trawl_base_terms(infiles, outfile = None, mode='w'):
-    terms = chain(* (open(infile,'r') for infile in infiles))
+    terms = list(chain(* (open(infile,'r') for infile in infiles)))
     resultset = run_wikidata_searches(terms)
     resultset.to_csv(outfile, mode)
diff --git a/transliterations/src/wikidata_transliterations.py b/transliterations/src/wikidata_transliterations.py
index d878354..0856c9c 100644
--- a/transliterations/src/wikidata_transliterations.py
+++ b/transliterations/src/wikidata_transliterations.py
@@ -23,7 +23,7 @@ def GetAllLabels(in_csvs, outfile, topNs):
 def load_entity_ids(in_csv, topN=5):
     with open(in_csv,'r',newline='') as infile:
-        reader = csv.DictReader(infile)
+        reader = list(csv.DictReader(infile))
         for row in reader:
             if int(row['search_position']) < topN:
                 yield row["entityid"]