"""Fetch Perspective API scores for the comments in ``all_data.csv``.

Each row of ``all_data.csv`` is sent to the Perspective API
(``commentanalyzer``) and the raw JSON response, tagged with the row's
``id``, is appended as one line to ``perspective_results.json``.

The script is resumable: ids already present in the results file are
skipped, and new results are appended after the existing ones.

Provenance: reconstructed from commit
e17a52e23619aff74eebc144c74514f7b02d093e ("add stuff to get perspective
scores from civil comments", Nathan TeBlunthuis, 2022-11-03) of
ml_measurement_error_public, file civil_comments/get_perspective_scores.py.
The same commit also adds, next to this script:
  - all_data.csv, identity_individual_annotations.csv and
    toxicity_individual_annotations.csv (git-annex symlinks)
  - perspective_api_key.gpg (binary)
"""
import csv
import json
from itertools import islice
from pathlib import Path
from time import sleep

from googleapiclient import discovery

# Pause after every API request.
REQUEST_DELAY_SECONDS = 0.10

# The key file is read from the working directory.  Strip surrounding
# whitespace so a trailing newline in the file does not end up in the key.
# NOTE(review): the commit ships only ``perspective_api_key.gpg``; presumably
# it has to be decrypted to ``perspective_api_key`` first -- confirm.
with open('perspective_api_key') as key_file:
    API_KEY = key_file.read().strip()

client = discovery.build(
    "commentanalyzer",
    "v1alpha",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)

outfile = Path("perspective_results.json")

# Collect the ids that a previous run already scored (one JSON object per
# line).  Blank lines are ignored.
already_scored = set()
if outfile.exists():
    with outfile.open('r') as previous_results:
        already_scored = {
            json.loads(row)['id'] for row in previous_results if row.strip()
        }

# Open the results in *append* mode: the ids collected above are skipped
# below, so truncating the file here would silently discard their results.
# ``newline=""`` is what the csv module expects so that quoted fields
# containing line breaks are parsed correctly.
with open("all_data.csv", newline="") as infile, outfile.open('a') as of:
    csvreader = csv.DictReader(infile)
    for line in csvreader:
        if line['id'] in already_scored:
            continue

        analyze_request = {
            'comment': {'text': line['comment_text']},
            'languages': ['en'],
            'requestedAttributes': {
                'TOXICITY': {},
                "SEVERE_TOXICITY": {},
                "IDENTITY_ATTACK": {},
                "INSULT": {},
                "PROFANITY": {},
                "THREAT": {},
            },
        }
        # NOTE(review): an exception raised here stops the run; rerunning
        # the script resumes after the last result written below.
        response = client.comments().analyze(body=analyze_request).execute()
        response['id'] = line['id']
        result = json.dumps(response)
        of.write(result + '\n')
        # Flush per record so an interrupted run keeps everything scored so far.
        of.flush()

        # Only sleep after an actual request; skipped rows cost nothing.
        sleep(REQUEST_DELAY_SECONDS)