code.communitydata.science - ml_measurement_error_public.git/commitdiff
git-annex in ntq8312@kibo:/data/ntq8312/ml_measurement_error_public synced/master
author ntq8312 <ntq8312@kibo.communitydata.science>
Wed, 9 Nov 2022 22:18:48 +0000 (16:18 -0600)
committer ntq8312 <ntq8312@kibo.communitydata.science>
Wed, 9 Nov 2022 22:18:48 +0000 (16:18 -0600)
civil_comments/get_perspective_scores.py
civil_comments/perspective_results.json [new symlink]

index e8e542b9bec110f051e3384ad958489b9b0e31b1..1f164e7ed28ca9aa3a00f5082d1c647ba190cc2d 100644 (file)
@@ -11,16 +11,17 @@ API_KEY = open('perspective_api_key').read()
 
 client = discovery.build("commentanalyzer","v1alpha",developerKey=API_KEY,discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",static_discovery=False,)
 
 
 client = discovery.build("commentanalyzer","v1alpha",developerKey=API_KEY,discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",static_discovery=False,)
 
-csvreader = csv.DictReader(open("all_data.csv"))
+csvreader = csv.DictReader(open("all_data.csv"), dialect='unix')
 
 outfile = Path("perspective_results.json")
 already_scored = set()
 if outfile.exists():
     already_scored = set([json.loads(l)['id'] for l in open(str(outfile),'r')])
 
 
 outfile = Path("perspective_results.json")
 already_scored = set()
 if outfile.exists():
     already_scored = set([json.loads(l)['id'] for l in open(str(outfile),'r')])
 
-with open("perspective_results.json",'w') as of:
+print(f"loaded {len(already_scored)} scored comments")
+with open("perspective_results.json",'a') as of:
     for line in csvreader:
     for line in csvreader:
-        if line['id'] not in already_scored:
+        if line['id'] not in already_scored and len(line.get('comment_text','')) > 0:
             analyze_request = {'comment':{'text':line['comment_text']},
                                'languages':['en'],
                                'requestedAttributes':{'TOXICITY':{},
             analyze_request = {'comment':{'text':line['comment_text']},
                                'languages':['en'],
                                'requestedAttributes':{'TOXICITY':{},
diff --git a/civil_comments/perspective_results.json b/civil_comments/perspective_results.json
new file mode 120000 (symlink)
index 0000000..37fcc10
--- /dev/null
@@ -0,0 +1 @@
+../.git/annex/objects/ZV/z8/SHA256E-s2293825121--6cdc8f8fb64fad2e51027e2564928e8938bf5fc6ca0cd6c31cb2e67aafe0a203.json/SHA256E-s2293825121--6cdc8f8fb64fad2e51027e2564928e8938bf5fc6ca0cd6c31cb2e67aafe0a203.json
\ No newline at end of file

Community Data Science Collective || Want to submit a patch?