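Improve tokenizer: lowercase comment bodies and tokenize them with NLTK's wordpunct_tokenize and MWETokenizer instead of whitespace splitting. Also prints each output chunk's DataFrame before it is converted to an Arrow table.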

author Nate E TeBlunthuis <nathante@n2347.hyak.local>
Tue, 4 Aug 2020 05:55:10 +0000 (22:55 -0700)
committer Nate E TeBlunthuis <nathante@n2347.hyak.local>
Tue, 4 Aug 2020 05:55:10 +0000 (22:55 -0700)

diff --git a/tf_reddit_comments.py b/tf_reddit_comments.py

index 010b75935e761585a87c01914b92b84c1c02d806..ec2dd2cddbb0d1cfbdce215b111820758f6a78d6 100644 (file)
--- a/tf_reddit_comments.py
+++ b/tf_reddit_comments.py
@@ -7,6 +7,7 @@ from collections import Counter
  import pandas as pd
  import os
  import datetime
+from nltk import wordpunct_tokenize, MWETokenizer
  
  # compute term frequencies for comments in each subreddit by week
  def weekly_tf(partition):
@@ -36,13 +37,15 @@ def weekly_tf(partition):
  
      subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
  
+    tokenizer = MWETokenizer()
+
      def tf_comments(subreddit_weeks):
          for key, posts in subreddit_weeks:
              subreddit, week = key
              tfs = Counter([])
  
              for post in posts:
-                tfs.update(post.body.split())
+                tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower())))
  
              for term, tf in tfs.items():
                  yield [subreddit, term, week, tf]
@@ -55,6 +58,7 @@ def weekly_tf(partition):
          while True:
              chunk = islice(outrows,outchunksize)
              pddf = pd.DataFrame(chunk, columns=schema.names)
+            print(pddf)
              table = pa.Table.from_pandas(pddf,schema=schema)
              if table.shape[0] == 0:
                  break
author	Nate E TeBlunthuis <nathante@n2347.hyak.local>
	Tue, 4 Aug 2020 05:55:10 +0000 (22:55 -0700)
committer	Nate E TeBlunthuis <nathante@n2347.hyak.local>
	Tue, 4 Aug 2020 05:55:10 +0000 (22:55 -0700)