]> code.communitydata.science - ml_measurement_error_public.git/commitdiff
add stuff to get perspective scores from civil comments
author Nathan TeBlunthuis <nathante@uw.edu>
Thu, 3 Nov 2022 00:45:35 +0000 (17:45 -0700)
committer Nathan TeBlunthuis <nathante@uw.edu>
Thu, 3 Nov 2022 00:45:35 +0000 (17:45 -0700)
civil_comments/all_data.csv [new symlink]
civil_comments/get_perspective_scores.py [new file with mode: 0644]
civil_comments/identity_individual_annotations.csv [new symlink]
civil_comments/perspective_api_key.gpg [new file with mode: 0644]
civil_comments/toxicity_individual_annotations.csv [new symlink]

diff --git a/civil_comments/all_data.csv b/civil_comments/all_data.csv
new file mode 120000 (symlink)
index 0000000..e30ea0d
--- /dev/null
@@ -0,0 +1 @@
+../.git/annex/objects/6v/fJ/SHA256E-s916052376--a85b5ba7e9a8cda38b91ea6e3957a4f2bfff17bb52f22c935595cbe47cc54d94.csv/SHA256E-s916052376--a85b5ba7e9a8cda38b91ea6e3957a4f2bfff17bb52f22c935595cbe47cc54d94.csv
\ No newline at end of file
diff --git a/civil_comments/get_perspective_scores.py b/civil_comments/get_perspective_scores.py
new file mode 100644 (file)
index 0000000..e8e542b
--- /dev/null
@@ -0,0 +1,57 @@
+"""Score Civil Comments text with the Perspective API.
+
+Reads comments from all_data.csv and appends one JSON response per line to
+perspective_results.json. Ids already present in that file are skipped, so
+an interrupted run can be restarted without losing or repeating work.
+"""
+from googleapiclient import discovery
+import json
+import csv
+from pathlib import Path
+
+from time import sleep
+
+from itertools import islice
+
+# Strip whitespace so a trailing newline in the key file is not sent as part
+# of the developer key; the with-block also closes the file handle.
+with open('perspective_api_key') as keyfile:
+    API_KEY = keyfile.read().strip()
+
+# NOTE(review): the version argument is "v1alpha" while the discovery URL
+# requests v1alpha1 -- confirm which one is intended.
+client = discovery.build("commentanalyzer","v1alpha",developerKey=API_KEY,discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",static_discovery=False,)
+
+outfile = Path("perspective_results.json")
+
+# Ids scored by earlier runs; blank lines in the results file are ignored.
+already_scored = set()
+if outfile.exists():
+    with outfile.open('r') as previous:
+        already_scored = {json.loads(row)['id'] for row in previous if row.strip()}
+
+# Open the results in append mode: mode 'w' would truncate the file and erase
+# the very records already_scored was just built from.
+# newline='' lets the csv module handle newlines inside quoted fields.
+with open("all_data.csv", newline='') as infile, outfile.open('a') as of:
+    csvreader = csv.DictReader(infile)
+    for line in csvreader:
+        if line['id'] in already_scored:
+            continue
+
+        analyze_request = {'comment':{'text':line['comment_text']},
+                           'languages':['en'],
+                           'requestedAttributes':{'TOXICITY':{},
+                                                  "SEVERE_TOXICITY":{},
+                                                  "IDENTITY_ATTACK":{},
+                                                  "INSULT":{},
+                                                  "PROFANITY":{},
+                                                  "THREAT":{}}}
+        response = client.comments().analyze(body=analyze_request).execute()
+        response['id'] = line['id']
+        of.write(json.dumps(response) + '\n')
+        # Flush after every record so a crash loses at most one response.
+        of.flush()
+
+        # Pause between requests (presumably to stay under the API quota).
+        sleep(0.10)
diff --git a/civil_comments/identity_individual_annotations.csv b/civil_comments/identity_individual_annotations.csv
new file mode 120000 (symlink)
index 0000000..20c95ea
--- /dev/null
@@ -0,0 +1 @@
+../.git/annex/objects/qP/Xw/SHA256E-s106388260--7b8e9f21c5110d32e337137f8b4fe50987ec1b59fdbfd56a4717cdc13e509ec3.csv/SHA256E-s106388260--7b8e9f21c5110d32e337137f8b4fe50987ec1b59fdbfd56a4717cdc13e509ec3.csv
\ No newline at end of file
diff --git a/civil_comments/perspective_api_key.gpg b/civil_comments/perspective_api_key.gpg
new file mode 100644 (file)
index 0000000..fbeda15
Binary files /dev/null and b/civil_comments/perspective_api_key.gpg differ
diff --git a/civil_comments/toxicity_individual_annotations.csv b/civil_comments/toxicity_individual_annotations.csv
new file mode 120000 (symlink)
index 0000000..b02f3cb
--- /dev/null
@@ -0,0 +1 @@
+../.git/annex/objects/FF/WZ/SHA256E-s417648663--c85bda15b964a24869ae11f76092bde6f4b18236dd1cbe17539526b3b5b736cf.csv/SHA256E-s417648663--c85bda15b964a24869ae11f76092bde6f4b18236dd1cbe17539526b3b5b736cf.csv
\ No newline at end of file

Community Data Science Collective || Want to submit a patch?