From: ntq8312 Date: Wed, 9 Nov 2022 22:18:48 +0000 (-0600) Subject: git-annex in ntq8312@kibo:/data/ntq8312/ml_measurement_error_public X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/commitdiff_plain/c42b94110b18264fdd66ada100ee05232b7b81bb git-annex in ntq8312@kibo:/data/ntq8312/ml_measurement_error_public --- diff --git a/civil_comments/get_perspective_scores.py b/civil_comments/get_perspective_scores.py index e8e542b..1f164e7 100644 --- a/civil_comments/get_perspective_scores.py +++ b/civil_comments/get_perspective_scores.py @@ -11,16 +11,17 @@ API_KEY = open('perspective_api_key').read() client = discovery.build("commentanalyzer","v1alpha",developerKey=API_KEY,discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",static_discovery=False,) -csvreader = csv.DictReader(open("all_data.csv")) +csvreader = csv.DictReader(open("all_data.csv"), dialect='unix') outfile = Path("perspective_results.json") already_scored = set() if outfile.exists(): already_scored = set([json.loads(l)['id'] for l in open(str(outfile),'r')]) -with open("perspective_results.json",'w') as of: +print(f"loaded {len(already_scored)} scored comments") +with open("perspective_results.json",'a') as of: for line in csvreader: - if line['id'] not in already_scored: + if line['id'] not in already_scored and len(line.get('comment_text','')) > 0: analyze_request = {'comment':{'text':line['comment_text']}, 'languages':['en'], 'requestedAttributes':{'TOXICITY':{}, diff --git a/civil_comments/perspective_results.json b/civil_comments/perspective_results.json new file mode 120000 index 0000000..37fcc10 --- /dev/null +++ b/civil_comments/perspective_results.json @@ -0,0 +1 @@ +../.git/annex/objects/ZV/z8/SHA256E-s2293825121--6cdc8f8fb64fad2e51027e2564928e8938bf5fc6ca0cd6c31cb2e67aafe0a203.json/SHA256E-s2293825121--6cdc8f8fb64fad2e51027e2564928e8938bf5fc6ca0cd6c31cb2e67aafe0a203.json \ No newline at end of file