From b3ffaaba1d065614f3f19ee0cbc876185dc220e1 Mon Sep 17 00:00:00 2001
From: Nate E TeBlunthuis
Date: Mon, 3 Aug 2020 22:55:10 -0700
Subject: [PATCH] improve tokenizer.

---
 tf_reddit_comments.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tf_reddit_comments.py b/tf_reddit_comments.py
index 010b759..ec2dd2c 100644
--- a/tf_reddit_comments.py
+++ b/tf_reddit_comments.py
@@ -7,6 +7,7 @@ from collections import Counter
 import pandas as pd
 import os
 import datetime
+from nltk import wordpunct_tokenize, MWETokenizer
 
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition):
@@ -36,13 +37,15 @@ def weekly_tf(partition):
 
     subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
 
+    tokenizer = MWETokenizer()
+
     def tf_comments(subreddit_weeks):
         for key, posts in subreddit_weeks:
             subreddit, week = key
             tfs = Counter([])
 
             for post in posts:
-                tfs.update(post.body.split())
+                tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower())))
 
             for term, tf in tfs.items():
                 yield [subreddit, term, week, tf]
@@ -55,6 +58,7 @@ def weekly_tf(partition):
         while True:
            chunk = islice(outrows,outchunksize)
            pddf = pd.DataFrame(chunk, columns=schema.names)
+           print(pddf)
            table = pa.Table.from_pandas(pddf,schema=schema)
            if table.shape[0] == 0:
                break
-- 
2.39.5
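
For reference, a minimal sketch of how the tokenization pipeline introduced
above behaves. The multi-word expression and sample comment body are
illustrative assumptions, not part of the patch. Note that MWETokenizer()
constructed with no arguments, as in the patch, has no multi-word
expressions registered, so its tokenize() is a pass-through until entries
are added via add_mwe().

    from nltk import wordpunct_tokenize, MWETokenizer

    # Same pipeline as in weekly_tf: wordpunct_tokenize splits the lowercased
    # body on word/punctuation boundaries, then MWETokenizer rejoins any
    # registered multi-word expressions with underscores.
    tokenizer = MWETokenizer()
    tokenizer.add_mwe(('side', 'effects'))  # hypothetical MWE, not in the patch

    body = "Vaccine side effects? Don't panic."
    tokens = tokenizer.tokenize(wordpunct_tokenize(body.lower()))
    print(tokens)
    # ['vaccine', 'side_effects', '?', 'don', "'", 't', 'panic', '.']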