From: Nate E TeBlunthuis Date: Tue, 4 Aug 2020 05:55:10 +0000 (-0700) Subject: improve tokenizer. X-Git-Url: https://code.communitydata.science/cdsc_reddit.git/commitdiff_plain/b3ffaaba1d065614f3f19ee0cbc876185dc220e1 improve tokenizer. --- diff --git a/tf_reddit_comments.py b/tf_reddit_comments.py index 010b759..ec2dd2c 100644 --- a/tf_reddit_comments.py +++ b/tf_reddit_comments.py @@ -7,6 +7,7 @@ from collections import Counter import pandas as pd import os import datetime +from nltk import wordpunct_tokenize, MWETokenizer # compute term frequencies for comments in each subreddit by week def weekly_tf(partition): @@ -36,13 +37,15 @@ def weekly_tf(partition): subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week)) + tokenizer = MWETokenizer() + def tf_comments(subreddit_weeks): for key, posts in subreddit_weeks: subreddit, week = key tfs = Counter([]) for post in posts: - tfs.update(post.body.split()) + tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower()))) for term, tf in tfs.items(): yield [subreddit, term, week, tf] @@ -55,6 +58,7 @@ def weekly_tf(partition): while True: chunk = islice(outrows,outchunksize) pddf = pd.DataFrame(chunk, columns=schema.names) + print(pddf) table = pa.Table.from_pandas(pddf,schema=schema) if table.shape[0] == 0: break