From: Nathan TeBlunthuis
Date: Sat, 28 Mar 2020 20:49:19 +0000 (-0700)
Subject: Keep better track of time.
X-Git-Url: https://code.communitydata.science/covid19.git/commitdiff_plain/282208507a28e1b77f93e0548906e3ffbb485bde?ds=inline

Keep better track of time.

- Add a timestamp to the transliterations output file.
- Append Wikidata search results instead of overwriting them.
---

diff --git a/transliterations/src/compile_transliterated_phrases.sh b/transliterations/src/compile_transliterated_phrases.sh
index 09f3bb5..55fe211 100755
--- a/transliterations/src/compile_transliterated_phrases.sh
+++ b/transliterations/src/compile_transliterated_phrases.sh
@@ -12,4 +12,5 @@ echo "Searching for Wikidata entities using Google trends"
 python3 wikidata_search.py ../data/output/related_searches_rising.csv ../data/output/related_searches_top.csv --use-gtrends --output ../data/output/wikidata_search_results_from_gtrends.csv
 
 echo "Finding transliterations from Wikidata using sparql"
-python3 wikidata_transliterations.py ../data/output/wikidata_search_results_from_gtrends.csv ../data/output/wikidata_search_results.csv --topN 10 20 --output ../data/output/wikidata_entity_labels.csv
+python3 wikidata_transliterations.py ../data/output/wikidata_search_results_from_gtrends.csv ../data/output/wikidata_search_results.csv --topN 10 20 --output ../data/output/$(date '+%Y-%m-%d')_wikidata_entity_labels.csv
+
diff --git a/transliterations/src/wikidata_search.py b/transliterations/src/wikidata_search.py
index e774f68..21e8598 100644
--- a/transliterations/src/wikidata_search.py
+++ b/transliterations/src/wikidata_search.py
@@ -15,12 +15,15 @@ class Wikidata_ResultSet:
                           for i, result in enumerate(results))
         )
 
-    def to_csv(self, outfile=None):
+    def to_csv(self, outfile=None, mode='w'):
         if outfile is None:
             of = stdout
         else:
-            of = open(outfile,'w',newline='')
+            if path.exists(outfile) and mode != 'w':
+                of = open(outfile,'a',newline='')
+            else:
+                of = open(outfile,'w',newline='')
 
         writer = csv.writer(of)
         writer.writerow(Wikidata_Result.__slots__)
         writer.writerows(map(Wikidata_Result.to_list, chain(* self.results)))
@@ -64,15 +67,15 @@ def read_google_trends_files(terms_files):
             yield row['query']
 
 
-def trawl_google_trends(terms_files, outfile = None):
+def trawl_google_trends(terms_files, outfile = None, mode='w'):
     terms = read_google_trends_files(terms_files)
     resultset = run_wikidata_searches(terms)
-    resultset.to_csv(outfile)
+    resultset.to_csv(outfile, mode)
 
-def trawl_base_terms(infiles, outfile = None):
+def trawl_base_terms(infiles, outfile = None, mode='w'):
     terms = chain(* (open(infile,'r') for infile in infiles))
     resultset = run_wikidata_searches(terms)
-    resultset.to_csv(outfile)
+    resultset.to_csv(outfile, mode)
 
 
 ## search each of the base terms in wikidata
@@ -84,6 +87,7 @@ if __name__ == "__main__":
     parser.add_argument('inputs', type=str, nargs='+', help='one or more files to read')
     parser.add_argument('--use-gtrends', action='store_true', help = 'toggle whether the input is the output from google trends')
     parser.add_argument('--output', type=str, help='an output file. defaults to stdout')
+    parser.add_argument('--overwrite', action='store_true', help = 'overwrite existing output files instead of appending')
     args = parser.parse_args()
     if args.use_gtrends:
         trawl_google_trends(args.inputs, args.output)
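
A note on the dated filename: the new shell line prefixes the output with the
current date via $(date '+%Y-%m-%d'). The sketch below shows the same naming
scheme in Python, for illustration only; the path simply mirrors the one in
compile_transliterated_phrases.sh.

    # Illustration only: the commit does this in shell via $(date '+%Y-%m-%d');
    # this is the equivalent naming scheme in Python.
    from datetime import date

    stamp = date.today().strftime('%Y-%m-%d')   # e.g. '2020-03-28'
    outfile = '../data/output/' + stamp + '_wikidata_entity_labels.csv'
    # -> '../data/output/2020-03-28_wikidata_entity_labels.csv'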
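
On the to_csv change: the hunk calls path.exists, which needs `from os import
path` (or equivalent) in scope; that import lies outside the lines shown here.
Below is a standalone sketch of the append-or-write decision under that
assumption. Note that because to_csv writes the header row unconditionally, a
file opened in append mode will end up with a second header line.

    # Standalone sketch of the new append-or-write logic in to_csv().
    # Assumes `from os import path` is in scope in wikidata_search.py;
    # that import is outside the lines this hunk shows.
    from os import path
    from sys import stdout

    def open_output(outfile=None, mode='w'):
        # No file given: write to stdout, as to_csv() does.
        if outfile is None:
            return stdout
        # Append only when the file already exists and append was requested;
        # otherwise (re)create the file.
        if path.exists(outfile) and mode != 'w':
            return open(outfile, 'a', newline='')
        return open(outfile, 'w', newline='')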
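
Finally, the new --overwrite flag is parsed but, in the context shown,
trawl_google_trends(args.inputs, args.output) is still called without a mode
argument, so that path keeps the default mode='w'. A hypothetical completion
that would wire the flag through is sketched below; the mode variable and the
else branch are assumptions, not part of this commit.

    # Hypothetical wiring for --overwrite; NOT in the commit as shown.
    # Assumes append is the intended default, per the flag's help text.
    mode = 'w' if args.overwrite else 'a'
    if args.use_gtrends:
        trawl_google_trends(args.inputs, args.output, mode)
    else:
        trawl_base_terms(args.inputs, args.output, mode)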