# this follows a similar approach to nick's trends.js but in python
from pytrends.request import TrendReq
from datetime import datetime
from os import path
import csv
from itertools import islice, chain, zip_longest
import pandas as pd


# from itertools recipes
# https://docs.python.org/3.6/library/itertools.html#itertools-recipes
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    Args:
        iterable: any iterable to split into chunks.
        n: the chunk length.
        fillvalue: value used to pad the final chunk when the input length
            is not a multiple of ``n``.

    Returns:
        An iterator of ``n``-tuples.
    """
    # The same iterator object repeated n times: zip_longest pulls from it
    # round-robin, which yields consecutive items n at a time.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def get_daily_trends():
    """Append today's trending searches to the daily trends CSV.

    Each row is ``date, term, top`` where ``top`` is the position of the
    term in the sequence returned by ``TrendReq.today_searches()``. The
    header row is written only when the output file does not exist yet.
    """
    trend_req = TrendReq(backoff_factor=0.2)
    today_trending = trend_req.today_searches()

    # Use one path for both the existence check and the write. Checking a
    # different file than the one being appended to makes the header
    # decision wrong (a header row would be appended on every run).
    daily_trends_outfile = path.join(
        "..", "output", "intermediate", "daily_google_trends.csv")

    header = ['date', 'term', 'top']
    # Must be evaluated before open(): mode 'a' creates the file.
    write_header = not path.exists(daily_trends_outfile)

    # One date for the whole batch, so all rows of a run agree.
    today = str(datetime.now().date())

    with open(daily_trends_outfile, 'a', newline='') as of:
        writer = csv.writer(of)
        if write_header:
            writer.writerow(header)

        for i, trend in enumerate(today_trending):
            writer.writerow([today, trend, i])


def get_related_queries(stems):
    """Fetch related queries for every stem and write them to CSV files.

    The stems are queried in batches of five. ``TrendReq.related_queries()``
    is iterated as a mapping of ``term -> {key: DataFrame or None}``; every
    non-None frame is tagged with a ``term`` column, grouped by ``key``,
    concatenated, stamped with a ``date`` column, and written to
    ``../output/intermediate/related_searches_<key>.csv``. An existing file
    is appended to without repeating the header.

    Args:
        stems: iterable of search terms.
    """
    trend_req = TrendReq(backoff_factor=0.2)

    def _get_related_queries(chunk):
        # grouper pads the last chunk with None; drop the padding.
        kw_list = [kw for kw in chunk if kw is not None]
        trend_req.build_payload(kw_list=kw_list)
        related_queries = trend_req.related_queries()
        for term, results in related_queries.items():
            for key, df in results.items():
                if df is not None:
                    df["term"] = term
                    yield (key, df)

    # we have to batch these in sets of 5
    pairs = chain.from_iterable(
        map(_get_related_queries, grouper(stems, 5)))

    # Group the per-term frames by result key.
    out = {}
    for key, value in pairs:
        out.setdefault(key, []).append(value)

    # One date for the whole run, so all output files agree.
    today = str(datetime.now().date())

    for key, frames in out.items():
        df = pd.concat(frames)
        df['date'] = today

        outfile = path.join(
            '..', 'output', 'intermediate', f"related_searches_{key}.csv")
        # Create the file with a header on first use; afterwards append
        # data rows only.
        is_new_file = not path.exists(outfile)
        df.to_csv(outfile,
                  mode='w' if is_new_file else 'a',
                  header=is_new_file,
                  index=False)


# One search term per line; blank lines are skipped so that an empty
# keyword is never included in a request.
with open("../resources/base_terms.txt", 'r') as terms_file:
    stems = [t.strip() for t in terms_file if t.strip()]

get_daily_trends()

get_related_queries(stems)