From 4dc949de5fb8d3eac04bae125c819100002c9522 Mon Sep 17 00:00:00 2001
From: Nate E TeBlunthuis
Date: Mon, 22 Feb 2021 16:03:48 -0800
Subject: [PATCH 1/1] Changes from hyak.

---
 clustering/Makefile                           | 28 +++++++++++--
 clustering/fit_tsne.py                        |  2 +-
 density/Makefile                              |  9 ++--
 density/job_script.sh                         |  2 +-
 density/overlap_density.py                    |  6 +--
 similarities/Makefile                         | 30 ++++++++++----
 .../similarities_helper.cpython-37.pyc        | Bin 6845 -> 10402 bytes
 similarities/cosine_similarities.py           | 39 ++++++++++++++----
 similarities/job_script.sh                    |  2 +-
 similarities/similarities_helper.py           | 38 +++++++++--------
 similarities/tfidf.py                         | 11 ++---
 similarities/wang_similarity.py               |  2 +-
 ...ubreddit_author_tf_similarities_10000.html |  2 +-
 visualization/tsne_vis.py                     | 12 +++++-
 14 files changed, 128 insertions(+), 55 deletions(-)

diff --git a/clustering/Makefile b/clustering/Makefile
index 115b218..20d7808 100644
--- a/clustering/Makefile
+++ b/clustering/Makefile
@@ -1,10 +1,32 @@
 #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
-all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather
+all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
+#all:/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
 
 /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
 #	$srun_cdsc python3
-	./clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
 
 /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 #	$srun_cdsc python3
-	./clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
+
+/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+#	$srun_cdsc
+	start_spark_and_run.sh 1 clustering.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85
+
+# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap.
+# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
+#	./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
+
+/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+
+	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
+
+# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
+#	python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
+
+/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+#	$srun_cdsc python3
+	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
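The clustering recipes above drive clustering.py entirely through its flags
(max_iter, convergence_iter, preference_quantile, damping), which map onto
scikit-learn's affinity propagation. A minimal sketch of the invocation these
recipes imply, assuming the similarity feather holds a subreddit name column
followed by a square similarity matrix; the real clustering.py may differ:

    import fire
    import numpy as np
    import pandas as pd
    from sklearn.cluster import AffinityPropagation

    def cluster(similarities, output, max_iter=400, convergence_iter=15,
                preference_quantile=0.85, damping=0.85):
        df = pd.read_feather(similarities)
        names = df.iloc[:, 0]            # assumed: first column holds subreddit names
        mat = np.array(df.iloc[:, 1:])   # assumed: remaining columns form the square similarity matrix
        # preference_quantile picks the AP 'preference' from the similarity distribution
        preference = np.quantile(mat, preference_quantile)
        clustering = AffinityPropagation(damping=damping,
                                         max_iter=max_iter,
                                         convergence_iter=convergence_iter,
                                         preference=preference,
                                         affinity='precomputed').fit(mat)
        pd.DataFrame({'subreddit': names,
                      'cluster': clustering.labels_}).to_feather(output)

    if __name__ == "__main__":
        fire.Fire(cluster)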
diff --git a/clustering/fit_tsne.py b/clustering/fit_tsne.py
index 28b0fd3..c9f45f6 100644
--- a/clustering/fit_tsne.py
+++ b/clustering/fit_tsne.py
@@ -5,7 +5,7 @@ from numpy import random
 import numpy as np
 from sklearn.manifold import TSNE
 
-similarities = "term_similarities_10000.feather"
+similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
 
 def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
     '''
diff --git a/density/Makefile b/density/Makefile
index 43075a4..d223399 100644
--- a/density/Makefile
+++ b/density/Makefile
@@ -1,7 +1,10 @@
-all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather
+all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather
 
 /gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
-	python3 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
+	start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
 
 /gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
-	python3 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
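The fit_tsne.py signature above fixes the t-SNE hyperparameters
(learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20). A
minimal sketch of the body this implies, assuming cosine similarities are
converted to distances and embedded with a precomputed metric; the
1 - similarity conversion and the output schema are assumptions:

    import numpy as np
    import pandas as pd
    from sklearn.manifold import TSNE

    def fit_tsne(similarities, output, learning_rate=750, perplexity=50,
                 n_iter=10000, early_exaggeration=20):
        df = pd.read_feather(similarities)
        names = df.iloc[:, 0]
        dist = 1 - np.array(df.iloc[:, 1:])   # cosine similarity -> distance
        tsne = TSNE(n_components=2,
                    learning_rate=learning_rate,
                    perplexity=perplexity,
                    n_iter=n_iter,
                    early_exaggeration=early_exaggeration,
                    metric='precomputed',
                    init='random')            # required for precomputed distances
        coords = tsne.fit_transform(dist)
        pd.DataFrame({'subreddit': names,
                      'x': coords[:, 0],
                      'y': coords[:, 1]}).to_feather(output)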
diff --git a/density/job_script.sh b/density/job_script.sh
index 553d0a1..7dfac14 100755
--- a/density/job_script.sh
+++ b/density/job_script.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py wang_overlaps --inpath=/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet --to_date=2020-04-13
+spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
 stop-all.sh
diff --git a/density/overlap_density.py b/density/overlap_density.py
index a1e9f6d..5a8e91a 100644
--- a/density/overlap_density.py
+++ b/density/overlap_density.py
@@ -5,7 +5,7 @@ import numpy as np
 import sys
 sys.path.append("..")
 sys.path.append("../similarities")
-from similarities.similarities_helper import read_tfidf_matrix, reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
 
 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
@@ -72,5 +72,5 @@ if __name__ == "__main__":
     fire.Fire({'authors':author_overlap_density,
                'terms':term_overlap_density,
                'author_weekly':author_overlap_density_weekly,
-               'term_weekly':term_overlap_density_weekly,
-               'wang_overlaps':wang_overlap_density})
+               'term_weekly':term_overlap_density_weekly})
+
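The overlap_density.py recipes pass --agg=pd.DataFrame.sum, so a subreddit's
density is an aggregate of its row of the similarity matrix (with a mean
aggregate it is the "mean shared membership per focal community member" named
in the comment above). A minimal sketch, assuming a 'subreddit' name column in
the similarity feather; the real overlap_density.py may handle I/O differently:

    import pandas as pd

    def overlap_density(inpath, outpath, agg=pd.DataFrame.sum):
        df = pd.read_feather(inpath)
        df = df.set_index('subreddit')      # assumed name column
        dens = agg(df, axis=1)              # e.g. row sums of similarities
        dens = dens.reset_index()
        dens.columns = ['subreddit', 'overlap_density']
        dens.to_feather(outpath)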
diff --git a/similarities/Makefile b/similarities/Makefile
index 51fd0fa..0ec0342 100644
--- a/similarities/Makefile
+++ b/similarities/Makefile
@@ -1,13 +1,25 @@
-all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
+all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms.parquet
 
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.feather
+# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
 
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather
+# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+#	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather
 
-/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.feather
-
-/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
-	start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
+/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+	start_spark_and_run.sh 1 tfidf.py terms --topN=10000
+
+/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+	start_spark_and_run.sh 1 tfidf.py authors --topN=10000
+
+/gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+
+/gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
+
+# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet
+#	start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
+
+/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+	start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
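The new tfidf.py rules materialize the TF-IDF datasets consumed by the
cosine-similarity rules below them. A rough PySpark sketch of that weighting
step; the (subreddit, term, tf) column names and the smoothed idf form are
assumptions, and the real build_tfidf_dataset() in similarities_helper.py is
more involved:

    from pyspark.sql import SparkSession, Window, functions as f

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")

    n_subs = df.select('subreddit').distinct().count()
    # document frequency: in how many subreddits does each term appear?
    idf = df.groupBy('term').agg(f.countDistinct('subreddit').alias('df'))
    idf = idf.withColumn('idf', f.log((n_subs + 1) / (f.col('df') + 1)))

    # term frequency normalized within each subreddit, then weighted by idf
    by_sub = Window.partitionBy('subreddit')
    df = df.withColumn('relative_tf', f.col('tf') / f.sum('tf').over(by_sub))
    df = df.join(idf, on='term')
    df = df.withColumn('tf_idf', f.col('relative_tf') * f.col('idf'))
    df.write.parquet("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet",
                     mode='overwrite')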
diff --git a/similarities/__pycache__/similarities_helper.cpython-37.pyc b/similarities/__pycache__/similarities_helper.cpython-37.pyc
index e5e4965095c4996e907817c2cdc72dede4e3e7b0..eb607f33b4ceca27baf1f8cfd31b66ae8550b102 100644
GIT binary patch
[base85 binary delta for the compiled bytecode (6845 -> 10402 bytes) omitted]
diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py
index 609e477..c1e99c9 100644
--- a/similarities/cosine_similarities.py
+++ b/similarities/cosine_similarities.py
@@ -1,30 +1,53 @@
 import pandas as pd
 import fire
 from pathlib import Path
-from similarities_helper import similarities
+from similarities_helper import similarities, column_similarities
 
-def cosine_similarities(infile, term_colname, outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False,from_date=None, to_date=None):
-    return similiarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date)
+def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
 
-def term_cosine_similarities(outfile, min_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
+
+
+def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                                'term',
                                outfile,
                                min_df,
+                               max_df,
+                               included_subreddits,
+                               topN,
+                               exclude_phrases,
+                               from_date,
+                               to_date)
+
+def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+    return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
+                               'author',
+                               outfile,
+                               min_df,
+                               max_df,
                                included_subreddits,
                                topN,
-                               exclude_phrasesby.)
+                               exclude_phrases=False,
+                               from_date=from_date,
+                               to_date=to_date)
 
-def author_cosine_similarities(outfile, min_df=2, included_subreddits=None, topN=10000, from_date=None, to_date=None):
+def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
     return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                                'author',
                                outfile,
                                min_df,
+                               max_df,
                                included_subreddits,
                                topN,
-                               exclude_phrases=False)
+                               exclude_phrases=False,
+                               from_date=from_date,
+                               to_date=to_date,
+                               tfidf_colname='relative_tf')
+
 
 if __name__ == "__main__":
     fire.Fire({'term':term_cosine_similarities,
-               'author':author_cosine_similarities})
+               'author':author_cosine_similarities,
+               'author-tf':author_tf_similarities})
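The new author-tf entry point above reuses the cosine machinery but weights
the matrix by relative_tf instead of tf_idf (via tfidf_colname='relative_tf').
The column_similarities function it imports is not shown in this patch; a
sketch consistent with how it is used, assuming a sparse term-by-subreddit
matrix whose columns are to be compared:

    import numpy as np
    from scipy.sparse import csr_matrix

    def column_similarities(mat: csr_matrix) -> np.ndarray:
        # L2-normalize each column so cosine similarity reduces to a dot product
        norms = np.asarray(np.sqrt(mat.power(2).sum(axis=0))).flatten()
        norms[norms == 0] = 1                 # guard against empty columns
        normed = mat.multiply(1 / norms)      # broadcasts across columns
        return np.asarray((normed.T @ normed).todense())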
diff --git a/similarities/job_script.sh b/similarities/job_script.sh
index e79a061..03e77de 100755
--- a/similarities/job_script.sh
+++ b/similarities/job_script.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 wang_similarity.py --infile=/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet --max_df=10 --outfile=/gscratch/comdata/output/reddit_similarity/wang_similarity_1000_max10.feather
+spark-submit --master spark://$(hostname):18899 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 stop-all.sh
diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py
index 69516a6..9e33c9d 100644
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -75,17 +75,20 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre
     spark.stop()
     return (tempdir, subreddit_names)
 
-def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+    '''
+    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
+    '''
     if from_date is not None or to_date is not None:
-        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
+        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
     else:
-        tempdir, subreddit_names = reindex_tfidf(infile, term_colname='author', min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
+        tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)
 
     print("loading matrix")
 #    mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname)
-    mat = read_tfidf_matrix(tempdir.name, term_colname)
+    mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname)
     print('computing similarities')
     sims = simfunc(mat)
     del mat
@@ -108,14 +111,24 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non
     sims.to_feather(outfile)
     tempdir.cleanup()
 
-def read_tfidf_matrix_weekly(path, term_colname, week):
+def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'):
     term = term_colname
     term_id = term + '_id'
     term_id_new = term + '_id_new'
 
     dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new],filter=ds.field('week')==week).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+    entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas()
+    return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1))))
+
+def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'):
+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'
+
+    dataset = ds.dataset(path,format='parquet')
+    print(f"tfidf_colname:{tfidf_colname}")
+    entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas()
+    return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))))
+
 
 def write_weekly_similarities(path, sims, week, names):
     sims['week'] = week
@@ -127,15 +140,6 @@ def write_weekly_similarities(path, sims, week, names):
     sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values)
     sims.to_parquet(p / week.isoformat())
 
-def read_tfidf_matrix(path,term_colname):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'
-
-    dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas()
-    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
-
 def column_overlaps(mat):
     non_zeros = (mat != 0).astype('double')
 
@@ -383,7 +387,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm
 
     return df
 
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv"):
+def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonswf.csv"):
     rankdf = pd.read_csv(path)
     included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
     return included_subreddits
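With tfidf_colname threaded through similarities() and read_tfidf_matrix(),
the same pipeline can be weighted either by tf-idf or by relative term
frequency. A usage sketch based on the signatures in this patch:

    from similarities_helper import similarities, column_similarities

    # tf-idf weighted author similarities (the default tfidf_colname)
    similarities(infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                 simfunc=column_similarities,
                 term_colname='author',
                 outfile='/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather',
                 topN=10000)

    # relative-tf weighted, as in the new author-tf entry point
    similarities(infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                 simfunc=column_similarities,
                 term_colname='author',
                 outfile='/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet',
                 topN=10000,
                 tfidf_colname='relative_tf')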
diff --git a/similarities/tfidf.py b/similarities/tfidf.py
index b7b4e63..885dae2 100644
--- a/similarities/tfidf.py
+++ b/similarities/tfidf.py
@@ -58,12 +58,13 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi
 
 def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                        topN=25000):
+
     return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
-                 outpath,
-                 topN,
-                 'term',
-                 []
-                 )
+                        outpath,
+                        topN,
+                        'term',
+                        []
+                        )
 
 
 if __name__ == "__main__":
diff --git a/similarities/wang_similarity.py b/similarities/wang_similarity.py
index 99dc3cb..452e07a 100644
--- a/similarities/wang_similarity.py
+++ b/similarities/wang_similarity.py
@@ -12,7 +12,7 @@ infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
 
 def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):
 
-    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=None, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
+    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
 
 if __name__ == "__main__":
     fire.Fire(wang_overlaps)
diff --git a/visualization/subreddit_author_tf_similarities_10000.html b/visualization/subreddit_author_tf_similarities_10000.html
index 722f5b0..eac12c5 100644
--- a/visualization/subreddit_author_tf_similarities_10000.html
+++ b/visualization/subreddit_author_tf_similarities_10000.html
@@ -14,7 +14,7 @@