X-Git-Url: https://code.communitydata.science/covid19.git/blobdiff_plain/09d171608f699eafe6528325938a892937a65302..98b07b8098611287eaa775b09622d1f3514303c8:/transliterations/src/collect_trends.py diff --git a/transliterations/src/collect_trends.py b/transliterations/src/collect_trends.py deleted file mode 100644 index 1820bc5..0000000 --- a/transliterations/src/collect_trends.py +++ /dev/null @@ -1,76 +0,0 @@ -# this follows a similar approach to nick's trends.js but in python -from pytrends.request import TrendReq -from datetime import datetime -from os import path -import csv -from itertools import islice, chain, zip_longest -import pandas as pd - - -# from itertools recipes -#https://docs.python.org/3.6/library/itertools.html#itertools-recipes -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" - args = [iter(iterable)] * n - return zip_longest(*args, fillvalue=fillvalue) - -def get_daily_trends(): - trendReq = TrendReq(backoff_factor=0.2) - today_trending = trendReq.today_searches() - daily_trends_outfile = path.join("..","output","daily_google_trends.csv") - - write_header = False - header = ['date','term','top'] - - if not path.exists(daily_trends_outfile): - write_header = True - - with open("../output/intermediate/daily_google_trends.csv",'a',newline='') as of: - writer = csv.writer(of) - if write_header: - writer.writerow(header) - - for i, trend in enumerate(today_trending): - writer.writerow([str(datetime.now().date()),trend,i]) - -def get_related_queries(stems): - # we have to batch these in sets of 5 - trendReq = TrendReq(backoff_factor=0.2) - def _get_related_queries(chunk): - kw_list = list(filter(lambda x: x is not None, chunk)) - trendReq.build_payload(kw_list=kw_list) - related_queries = trendReq.related_queries() - for term, results in related_queries.items(): - for key, df in results.items(): - if df is not None: - df["term"] = term - yield (key,df) - - l = chain(*map(_get_related_queries, grouper(stems,5))) - out = {} - for key, value in l: - if key in out: - out[key].append(value) - else: - out[key] = [value] - - for k in out.keys(): - df = pd.concat(out[k]) - df['date'] = str(datetime.now().date()) - out[k] = df - outfile = path.join('..','output','intermediate',f"related_searches_{k}.csv") - if path.exists(outfile): - mode = 'a' - header = False - else: - mode = 'w' - header = True - - df.to_csv(outfile, mode=mode, header=header,index=False) - -stems = [t.strip() for t in open("../resources/base_terms.txt",'r')] - -get_daily_trends() - -get_related_queries(stems)