code.communitydata.science - covid19.git/commitdiff
Read entire input files before making API calls.
author Nathan TeBlunthuis <nathante@uw.edu>
Sat, 28 Mar 2020 20:55:52 +0000 (13:55 -0700)
committer Nathan TeBlunthuis <nathante@uw.edu>
Sat, 28 Mar 2020 20:55:52 +0000 (13:55 -0700)
This is a nicer style because it does not hold onto resources for as long.
It will use a bit more memory.
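
A rough sketch of the pattern this commit adopts (illustrative only: read_terms, lookup, and collect below are placeholder names, not functions from this repository). Materializing the generator into a list means every input file is read and closed before the slow API calls begin, at the cost of holding all terms in memory at once.

def read_terms(paths):
    # Lazily yield one stripped line at a time across all input files.
    for p in paths:
        with open(p, 'r') as f:
            for line in f:
                yield line.strip()

def lookup(term):
    # Stand-in for a slow network request (e.g. a search API call).
    return {"term": term}

def collect(paths):
    # list(...) forces the files to be read to completion (and closed) up front;
    # the trade-off is keeping every term in memory before any lookups run.
    terms = list(read_terms(paths))
    return [lookup(t) for t in terms]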

transliterations/src/collect_trends.py [new file with mode: 0644]
transliterations/src/wikidata_search.py
transliterations/src/wikidata_transliterations.py

diff --git a/transliterations/src/collect_trends.py b/transliterations/src/collect_trends.py
new file mode 100644 (file)
index 0000000..16a5c93
--- /dev/null
@@ -0,0 +1,76 @@
+# This follows a similar approach to Nick's trends.js, but in Python.
+from pytrends.request import TrendReq
+from datetime import datetime
+from os import path
+import csv
+from itertools import islice, chain, zip_longest
+import pandas as pd
+
+
+# From the itertools recipes:
+# https://docs.python.org/3.6/library/itertools.html#itertools-recipes
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+    args = [iter(iterable)] * n
+    return zip_longest(*args, fillvalue=fillvalue)
+
+def get_daily_trends():
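+    # Append today's trending Google searches to the daily output CSV, one row per (date, term, rank).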
+    trendReq = TrendReq(backoff_factor=0.2)
+    today_trending = trendReq.today_searches()
+    daily_trends_outfile = path.join("..","data","output","daily_google_trends.csv")
+
+    write_header = False
+    header = ['date','term','top']
+
+    if not path.exists(daily_trends_outfile):
+        write_header = True
+
+    with open(daily_trends_outfile, 'a', newline='') as of:
+        writer = csv.writer(of)
+        if write_header:
+            writer.writerow(header)
+
+        for i, trend in enumerate(today_trending):
+            writer.writerow([str(datetime.now().date()),trend,i])
+
+def get_related_queries(stems):
+    # Google Trends payloads accept at most five keywords, so batch the stems in groups of five.
+    trendReq = TrendReq(backoff_factor=0.2)
+    def _get_related_queries(chunk):
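+        # Query one batch of keywords and yield (result_type, dataframe) pairs from related_queries().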
+        kw_list = list(filter(lambda x: x is not None, chunk))
+        trendReq.build_payload(kw_list=kw_list)
+        related_queries = trendReq.related_queries()
+        for term, results in related_queries.items():
+            for key, df in results.items():
+                if df is not None:
+                    df["term"] = term
+                yield (key,df)
+
+    results = chain(*map(_get_related_queries, grouper(stems, 5)))
+    out = {}
+    for key, value in results:
+        if key in out:
+            out[key].append(value)
+        else:
+            out[key] = [value]
+
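+    # For each result type, concatenate the batches, stamp today's date, and append to its CSV.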
+    for k in out.keys():
+        df = pd.concat(out[k])
+        df['date'] = str(datetime.now().date())
+        out[k] = df
+        outfile = path.join('..','data','output',f"related_searches_{k}.csv")
+        if path.exists(outfile):
+            mode = 'a'
+            header = False
+        else:
+            mode = 'w'
+            header = True
+
+        df.to_csv(outfile, mode=mode, header=header,index=False)
+
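+# Load the seed search terms, one per line.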
+with open("../data/input/base_terms.txt", 'r') as f:
+    stems = [t.strip() for t in f]
+
+get_daily_trends()
+
+get_related_queries(stems)
diff --git a/transliterations/src/wikidata_search.py b/transliterations/src/wikidata_search.py
index 21e8598a2045f3cbdafb09bef848ff57adcd0116..a3abbc0d800732298cc6481261dc967945586a18 100644 (file)
@@ -68,12 +68,12 @@ def read_google_trends_files(terms_files):
 
 
 def trawl_google_trends(terms_files, outfile = None, mode='w'):
-    terms = read_google_trends_files(terms_files)
+    terms = list(read_google_trends_files(terms_files))
     resultset = run_wikidata_searches(terms)
     resultset.to_csv(outfile, mode)
 
 def trawl_base_terms(infiles, outfile = None, mode='w'):
-    terms = chain(* (open(infile,'r') for infile in infiles))
+    terms = list(chain(* (open(infile,'r') for infile in infiles)))
     resultset = run_wikidata_searches(terms)
     resultset.to_csv(outfile, mode)
 
diff --git a/transliterations/src/wikidata_transliterations.py b/transliterations/src/wikidata_transliterations.py
index d878354876bc10fd229fc1bd0959c2518f3fe2e1..0856c9cc01d132cd2825522c06ac850eaa48c4f9 100644 (file)
@@ -23,7 +23,7 @@ def GetAllLabels(in_csvs, outfile, topNs):
 
     def load_entity_ids(in_csv, topN=5):
         with open(in_csv,'r',newline='') as infile:
-            reader = csv.DictReader(infile)
+            reader = list(csv.DictReader(infile))
             for row in reader:
                 if int(row['search_position']) < topN:
                     yield row["entityid"]
