From: Nate E TeBlunthuis
Date: Tue, 4 Aug 2020 05:55:10 +0000 (-0700)
Subject: improve tokenizer.
X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/b3ffaaba1d065614f3f19ee0cbc876185dc220e1?ds=inline;hp=-c

improve tokenizer.
---

b3ffaaba1d065614f3f19ee0cbc876185dc220e1
diff --git a/tf_reddit_comments.py b/tf_reddit_comments.py
index 010b759..ec2dd2c 100644
--- a/tf_reddit_comments.py
+++ b/tf_reddit_comments.py
@@ -7,6 +7,7 @@ from collections import Counter
 import pandas as pd
 import os
 import datetime
+from nltk import wordpunct_tokenize, MWETokenizer
 
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition):
@@ -36,13 +37,15 @@ def weekly_tf(partition):
 
     subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
 
+    tokenizer = MWETokenizer()
+
     def tf_comments(subreddit_weeks):
         for key, posts in subreddit_weeks:
             subreddit, week = key
             tfs = Counter([])
 
             for post in posts:
-                tfs.update(post.body.split())
+                tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower())))
 
             for term, tf in tfs.items():
                 yield [subreddit, term, week, tf]
@@ -55,6 +58,7 @@ def weekly_tf(partition):
     while True:
        chunk = islice(outrows,outchunksize)
        pddf = pd.DataFrame(chunk, columns=schema.names)
+       print(pddf)
        table = pa.Table.from_pandas(pddf,schema=schema)
        if table.shape[0] == 0:
            break
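
Note: for reference, a minimal standalone sketch of what the new tokenization pipeline does. wordpunct_tokenize splits text on word/punctuation boundaries, and MWETokenizer re-joins any registered multi-word expressions into single tokens. The commit constructs MWETokenizer() with no expressions registered, so at this point it passes tokens through unchanged; the example text and the add_mwe call below are illustrative only and do not appear in the repo.

    from nltk import wordpunct_tokenize, MWETokenizer

    # Hypothetical example text, not from the repo.
    text = "Multi-word expressions like New York stay together."

    tokenizer = MWETokenizer()          # no MWEs registered: acts as a pass-through
    tokenizer.add_mwe(('new', 'york'))  # illustrative: register one lowercased MWE

    # Same pipeline as the diff: lowercase, split, then merge registered MWEs.
    tokens = tokenizer.tokenize(wordpunct_tokenize(text.lower()))
    print(tokens)
    # ['multi', '-', 'word', 'expressions', 'like', 'new_york', 'stay', 'together', '.']

Compared to the old post.body.split(), this lowercases terms and separates punctuation from words, so "word," and "Word" no longer count as distinct terms.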