B
    3ë_ >  ã               @   sD  d dl mZ d dl mZ d dl mZ d dlmZ d dlmZ d dl	m
Z
 d dlZd dlmZ d dlmZmZ d dlZd dlZd dlZd d	lmZ d d
lmZ G dd„ deƒZdZd0dd„Zd1dd„Zd2dd„Zd3dd„Zd4dd„Zdd„ Z dd„ Z!dd „ Z"d!d"„ Z#d#d$„ Z$d%d&„ Z%ej&fd'd(„Z'd)d*„ Z(ej&fd+d,„Z)d5d.d/„Z*dS )6é    )ÚSparkSession)ÚWindow)Ú	functions)ÚEnum)ÚCoordinateMatrix)ÚTemporaryDirectoryN)Ú
csr_matrixÚissparse)Údatetime)ÚPathc               @   s   e Zd ZdZdZdS )Ú	tf_weighté   é   N)Ú__name__Ú
__module__Ú__qualname__ÚMaxTFÚNorm05© r   r   úP/gscratch/comdata/users/nathante/cdsc_reddit/similarities/similarities_helper.pyr      s   r   zO/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquetéô  Fc	             C   s  |}	|	d }
|	d }t j ¡ }|j ¡ }t|ƒ |j | ¡}|d k	rjt|ƒt	krZt
 |¡}| |j|k¡}|d k	r˜t|ƒt	krˆt
 |¡}| |j|k ¡}| dd|
|	g¡ t d¡ d¡¡}t||tjƒ}t|||||ƒ}| |j¡}| ddg¡ ¡  ¡ }| d¡}|d d |d< ||fS )NÚ_idÚ_id_newÚ	subredditÚweekÚtfÚsubreddit_id_newr   )r   ÚbuilderÚgetOrCreateÚsparkContextÚgetConfÚprintÚreadÚparquetÚtypeÚstrr
   ÚfromisoformatÚfilterr   ÚgroupByÚaggÚfÚsumÚaliasÚ_calc_tfidfr   r   Úprep_tfidf_entriesZread_parquetÚnameÚselectÚdistinctÚtoPandasÚsort_values)ÚinfileÚterm_colnameÚmin_dfÚmax_dfÚincluded_subredditsÚtopNÚexclude_phrasesÚ	from_dateÚto_dateÚtermÚterm_idÚterm_id_newÚsparkÚconfÚtfidf_weeklyÚtfidfÚtempdirÚsubreddit_namesr   r   r   Úreindex_tfidf_time_interval   s.    



$
rF   c             C   sÌ   t j ¡ }|j ¡ }t|ƒ |j | ¡}	|d kr:t|ƒ}nt	t
|ƒƒ}|dkrf|	 t |¡ d¡ ¡}	tdƒ t|	||||ƒ}
|j |
j¡}	|	 ddg¡ ¡  ¡ }| d¡}|d d |d< | ¡  |
|fS )NTÚ_z/creating temporary parquet with matrix indiciesr   r   r   )r   r   r   r   r    r!   r"   r#   Úselect_topN_subredditsÚsetÚopenr'   r*   ÚcolÚcontainsr.   r/   r0   r1   r2   r3   Ústop)r4   r5   r6   r7   r8   r9   r:   r@   rA   rC   rD   rE   r   r   r   Úreindex_tfidf5   s"    



rN   Útf_idfc             C   sP  |	dk	s|
dk	r0t | |||||d|	|
d	\}}nt| |||||dd\}}tdƒ t|j||ƒ}tdƒ ||ƒ}~t|ƒr‚| ¡ }td|j› ƒ tdt|j	j
ƒ› ƒ t |¡}|jd	d
„ t|j	j
ƒD ƒdd}|j	j
|d< t|ƒ}tt|ƒ d |j¡d¡ƒ}tt|ƒ d |j¡d¡ƒ}tt|ƒ d |j¡d¡ƒ}| |¡ | ¡  dS )z”
    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
    NF)r5   r6   r7   r8   r9   r:   r;   r<   )r5   r6   r7   r8   r9   r:   zloading matrixzcomputing similaritieszshape of sims:z&len(subreddit_names.subreddit.values):c             S   s   i | ]\}}||“qS r   r   )Ú.0ÚiÚsrr   r   r   ú
<dictcomp>f   s    z similarities.<locals>.<dictcomp>r   )Úaxisr   Ú z.featherz.csvz.parquet)rF   rN   r!   Úread_tfidf_matrixr/   r	   ZtodenseÚshapeÚlenr   ÚvaluesÚpdÚ	DataFrameÚrenameÚ	enumerater   r%   ÚreplaceÚjoinÚsuffixesZ
to_featherÚcleanup)r4   Zsimfuncr5   Úoutfiler6   r7   r8   r9   r:   r;   r<   Útfidf_colnamerD   rE   ÚmatÚsimsÚpZoutput_featherZ
output_csvZoutput_parquetr   r   r   ÚsimilaritiesO   s*     
 
rg   c       	      C   sf   |}|d }|d }t j| dd}|j|d|gt  d¡|kd ¡ }t|| || d |jd ffƒS )	Nr   r   r#   )Úformatr   r   )Úcolumnsr'   r   )ÚdsÚdatasetÚto_tableÚfieldÚ	to_pandasr   r   )	Úpathr5   r   rc   r=   r>   r?   rk   Úentriesr   r   r   Úread_tfidf_matrix_weeklyr   s    "rq   c             C   sh   |}|d }|d }t j| dd}td|› ƒ |j|d|gd ¡ }t|| || d |jd ffƒS )	Nr   r   r#   )rh   ztfidf_colname:r   )ri   r   )rj   rk   r!   rl   rn   r   r   )ro   r5   rc   r=   r>   r?   rk   rp   r   r   r   rV   {   s    rV   c             C   sN   ||d< t  | ¡}| ¡ s"| ¡  |jddg|jjd}| || ¡  ¡ d S )Nr   r   )Zid_varsZ
value_vars)	Úpathlibr   Úis_dirÚmkdirZmeltr   rY   Z
to_parquetÚ	isoformat)ro   re   r   Únamesrf   r   r   r   Úwrite_weekly_similarities…   s    
rw   c             C   s>   | dk  d¡}|j| }|jdd}tj ||¡| }|| S )Nr   Údouble)rT   )ÚastypeÚTr+   ÚnpÚaddÚouter)rd   Z	non_zerosÚintersectionZcard1Zdenr   r   r   Úcolumn_overlaps   s
    
r   c             C   sB   t  t j|  d¡jdddt jd¡}|  d| ¡} | j|  }|S )Nr   r   )rT   g      à?)Údtyper   )r{   ÚmatrixÚpowerr+   Úfloat32Úmultiplyrz   )rd   Únormre   r   r   r   Úcolumn_similarities˜   s    &
r†   c             C   s€  |}|d }|d }|d kr<dt |ƒ }|  t d¡|k¡} |d k	rX|  t d¡|k¡} |  t d¡ |¡¡} |  ddg¡ ¡ }| dt ¡  	t
 d¡ d¡¡¡}|  |ddg¡} |  |dg¡ t |¡ d	¡¡}	| j|	|dgd
d} |  |dg¡ ¡ }
|
 |t ¡  	t
 d¡ |¡¡¡}
|  |
|dg¡} |  dd¡} |  d| j| j  d¡¡} tdddd}|  d¡} | jj|jddd |S )Nr   r   gš™™™™™¹?Úcountr   Úsubreddit_idr   r   Ú	new_countÚinner)ÚhowrO   Ú
tf_idf_oldÚfloatz.parquetÚterm_tfidf_entriesÚ.)ÚsuffixÚprefixÚdirÚ	overwriteÚsnappy)ÚmodeÚcompression)rX   r'   r*   rK   Úisinr0   r1   Ú
withColumnÚ
row_numberÚoverr   ÚpartitionByÚorderByr_   r(   r)   r‡   r,   ÚwithColumnRenamedÚrelative_tfÚidfÚcastr   ÚrepartitionÚwriter#   r/   )rC   r5   r6   r7   r8   r=   r>   r?   Úsub_idsr‰   Úterm_idsrD   r   r   r   Úprep_tfidf_entries_weeklyŸ   s.    " "
r¥   c             C   sT  |}|d }|d }|d kr<dt |ƒ }|  t d¡|k¡} |d k	rX|  t d¡|k¡} |  t d¡ |¡¡} |  d¡ ¡ }| dt ¡  	t
 d¡¡¡}|  |d¡} |  |¡ t |¡ d¡¡}	| j|	|d	d
} |  |g¡ ¡ }
|
 |t ¡  	t
 |¡¡¡}
|  |
|¡} |  dd¡} |  d| j| j  d¡¡} tdddd}| jj|jddd |S )Nr   r   gš™™™™™¹?r‡   r   rˆ   r   r‰   rŠ   )r‹   rO   rŒ   r   z.parquetrŽ   r   )r   r‘   r’   r“   r”   )r•   r–   )rX   r'   r*   rK   r—   r0   r1   r˜   r™   rš   r   rœ   r_   r(   r)   r‡   r,   r   rž   rŸ   r    r   r¢   r#   r/   )rC   r5   r6   r7   r8   r=   r>   r?   r£   r‰   r¤   rD   r   r   r   r.   Å   s,    r.   c             C   sr  |}|d }|d }|d kr(dt |ƒ }|  t d¡ |¡¡} |  ¡ } |  d¡ ¡ }| dt 	¡  
t d¡¡¡}|  |d¡} |  |¡ t |¡ d¡¡}	| j|	|dd	} |  |g¡ ¡ }
|
 |t 	¡  
t |¡¡¡}
|  |
|¡} |  d
d¡} |  d
| j| j ¡} tt |ƒd d ƒ}|  t |¡d t d¡d d
¡j |¡}t|ƒ}t|j |¡ƒ}| ¡ }|j|d}|| fS )Nr   r   gš™™™™™¹?r   rˆ   r   r‰   rŠ   )r‹   rO   rŒ   r   é   r   )Ú	threshold)rX   r'   r*   rK   r—   Úcacher0   r1   r˜   r™   rš   r   rœ   r_   r(   r)   r‡   r,   r   rž   rŸ   ÚintÚrddr¡   r   rp   ZtoRowMatrixZcolumnSimilarities)rC   r5   r6   r8   Zsimilarity_thresholdr=   r>   r?   r£   r‰   r¤   Zn_partitionsrp   ZcoordMatrd   Zsim_distr   r   r   Úspark_cosine_similaritiesê   s0    *r«   c             C   sî  |}|d }|   | j |¡¡} |  d|dg¡ t d¡ d¡¡} |  ddg¡ 	d¡}| 
dd¡}| j|ddgd} |  d| j| j ¡} |  |dg¡ ¡ }|  ddg¡ ¡  dg¡ t d¡ d	¡¡}|j|dgd}| d
t |j¡dt d¡  d ¡}| |dg¡ ¡ }	|	 |t ¡  t d¡ |¡¡¡}	|  ddg¡ ¡ }
|
 dt ¡  t d¡ d¡¡¡}
| j|
ddgd} | j|	|dgd} |j|	|dgd}| j|||dgd} |tjkrÎ|  d| j| j ¡} n|  ddd| j  | j ¡} | S )Nr   r   r   r   zmax(tf)Ú	sr_max_tf)Úonrž   Úsubreddits_in_weekrŸ   r   r‡   rˆ   rO   g      à?)r'   r   r—   r(   r)   r*   r+   r,   ÚgroupbyÚmaxr   r_   r˜   r   r¬   r‡   r0   r1   Úlogr®   rK   r™   rš   r   r›   rœ   r   r   rž   rŸ   )ÚdfÚinclude_subsr5   Ú	tf_familyr=   r>   Úmax_subreddit_termsrŸ   ÚN_docsÚtermsÚ
subredditsr   r   r   Úbuild_weekly_tfidf_dataset  s0    ",&""r¹   c       
   
   C   sf  |}|d }|   dg¡ d¡}| dd¡}| j|dd} |  d| j| j ¡} |   |g¡ ¡ }|  d¡ 	¡  ¡ }| dt
 |d	t
 d
¡  ¡d	 ¡}| |¡ 	¡ }| |t
 ¡  t |¡¡¡}|  dg¡ 	¡ }	|	 dt
 ¡  t d¡¡¡}	| j|	dd} | j||d} |j||d}| j|||gd} |tjkrF|  d| j| j ¡} n|  ddd| j  | j ¡} | S )Nr   r   r   zmax(tf)r¬   )r­   rž   rŸ   r   r‡   rˆ   rO   g      à?)r¯   r°   r   r_   r˜   r   r¬   r‡   r0   r1   r*   r±   rK   r™   rš   r   rœ   r   r   rž   rŸ   )
r²   r5   r´   r=   r>   rµ   rŸ   r¶   r·   r¸   r   r   r   r-   O  s*    $r-   c             C   sN   |}|d }|   | j |¡¡} |  d|g¡ t d¡ d¡¡} t| ||ƒ} | S )Nr   r   r   )	r'   r   r—   r(   r)   r*   r+   r,   r-   )r²   r³   r5   r´   r=   r>   r   r   r   Úbuild_tfidf_dataset{  s     rº   úP/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonswf.csvc             C   s(   t  |¡}t|j|j| kdf jƒ}|S )Nr   )rZ   Zread_csvrI   ÚlocZcomments_rankrY   )r9   ro   Zrankdfr8   r   r   r   rH   †  s    
rH   )NNNr   FNN)NNNr   F)NNNr   FNNrO   )rO   )rO   )r»   )+Úpyspark.sqlr   r   r   r*   Úenumr   Z pyspark.mllib.linalg.distributedr   Útempfiler   ÚpyarrowZpyarrow.datasetrk   rj   Zscipy.sparser   r	   ÚpandasrZ   Únumpyr{   rr   r
   r   r   r4   rF   rN   rg   rq   rV   rw   r   r†   r¥   r.   r«   r   r¹   r-   rº   rH   r   r   r   r   Ú<module>   s<   


#
	


	&%23,