# coding: utf-8

# # Import data and get things set up

import random
random.seed(9001)

# import the code to write R objects and create the variable we'll write to
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

r = {}
def remember(name, x):
    r[name] = x

# load in the modules we'll need for the analysis
import subprocess
import csv
from igraph import *
import pandas as pd
import numpy as np
import re

# small helper to grab the largest weakly connected component of a graph
def get_largest_component(g):
    g_components = g.components(mode="WEAK")
    max_size = max(g_components.sizes())
    for g_tmp in g_components.subgraphs():
        if g_tmp.vcount() == max_size:
            return g_tmp

# yield (from, to) tuples from an edgelist dataframe so igraph can load it
def edge_list_iter(df):
    for i, row in df.iterrows():
        yield (row['from'], row['to'])

# list the top 5 journals for each of the clusters
def top_journals_for_clusters(clu):
    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])

    cluster_counts = []
    for cid in articles_tmp['cluster'].unique():
        journal_counts = articles_tmp.loc[articles_tmp['cluster'] == cid,
                                          'source_title'].value_counts().head(5)
        cluster_counts.append(pd.DataFrame({'cluster' : cid,
                                            'count' : journal_counts}))

    output = pd.concat(cluster_counts)
    output = output.reset_index()
    output = output.rename(columns={'index' : "journal"})
    return output


def infomap_edgelist(g, edgelist_filename, directed=True):
    nodes_tmp = pd.DataFrame([{'node_infomap' : v.index,
                               'eid' : v['name']} for v in g.vs])

    # write out the edgelist to an external file so we can call Infomap on it
    with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
        for e in g.es:
            if e.source != e.target:
                if 'weight' in e.attributes():
                    print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
                else:
                    print("{}\t{}".format(e.source, e.target), file=f)

    # run the external program to generate the Infomap clustering
    infomap_cmdline = ["code/bibliometrics/infomap/Infomap",
                       "code/bibliometrics/" + edgelist_filename + ".txt",
                       "code/bibliometrics/output_dir",
                       "-z", "--map", "--clu", "--tree"]
    if directed:
        infomap_cmdline.append("-d")
    subprocess.call(infomap_cmdline)

    # load up the .clu data (one node per row: node id, cluster, flow)
    clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
                      header=None, comment="#", delim_whitespace=True)
    clu.columns = ['node_infomap', 'cluster', 'flow']

    return pd.merge(clu, nodes_tmp, on="node_infomap")


def write_graphml(g, clu, graphml_filename):
    clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
    g.vs["cluster"] = clu["cluster"].tolist()
    g.write_graphml("code/bibliometrics/" + graphml_filename)


# load article data
articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")

# # network for just the central "social media" set

# this contains the list of all INCOMING citations for each paper in the original set
raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")

g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)

g_sm = get_largest_component(g_sm_all)
g_sm = g_sm.simplify()

g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)

g_sm_clu['cluster'].value_counts()

write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
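
# The check below is an added sketch, not part of the original pipeline: it
# assumes g_sm and g_sm_clu were built as above and simply confirms that every
# vertex received a cluster assignment, then reuses top_journals_for_clusters()
# (also used for the full network below) to eyeball the clusters.
print(len(g_sm_clu) == g_sm.vcount())       # expect True: one cluster row per vertex
print(top_journals_for_clusters(g_sm_clu))  # top 5 journals within each cluster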

# # larger network that contains the incoming cites to citing articles

# this contains the list of all INCOMING citations to everything in the original set
# plus every INCOMING citation to every paper that cites one of those papers
raw_edgelist_files = ["processed_data/citation_edgelist.txt",
                      "processed_data/social_media_edgelist.txt"]
combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])

g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)

g_full = get_largest_component(g_full_all)
g_full = g_full.simplify()

g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)

g_full_clu['cluster'].value_counts()

top_journals_for_clusters(g_full_clu)

write_graphml(g_full, g_full_clu, "g_full.graphml")

# # create the meta-network of connections between clusters

# attach the cluster of the cited ("to") article to each edge
edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]],
                        how="inner", left_on="to", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
edgelist_tmp.drop(columns='eid', inplace=True)

# attach the cluster of the citing ("from") article to each edge
edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]],
                        how="inner", left_on="from", right_on="eid")
edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
edgelist_tmp.drop(columns='eid', inplace=True)

# keep only the edges that cross cluster boundaries
edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]

# count the citations between each pair of clusters
cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
cluster_edgelist["to_cluster"] = cluster_edgelist.index

cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]

remember("cluster_edgelist", cluster_edgelist)

top_clusters = g_sm_clu["cluster"].value_counts().head(6).index

# write the edgelist for just the top clusters (currently clusters 1-6)
cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
                                           (cluster_edgelist["from_cluster"].isin(top_clusters))]
cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]

g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values],
                            directed=True)
g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()

# assign the total number of articles as an attribute of each cluster node
g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()

g_cluster.write_graphml("code/bibliometrics/clusters.graphml")

# # create network stats for tables (overall and within clusters)

def create_network_stats(g):
    network_stats = pd.DataFrame({'eid' : g.vs['name'],
                                  'eig_cent' : g.eigenvector_centrality(),
                                  'indegree' : g.indegree(),
                                  'betweenness' : g.betweenness()})

    network_stats = pd.merge(network_stats,
                             articles[['eid', 'title', 'source_title']],
                             how="inner")
    return network_stats

network_stats = create_network_stats(g_full)

network_stats.sort_values("indegree", ascending=False).head(4)

network_stats.sort_values("eig_cent", ascending=False).head(4)

network_stats.sort_values("betweenness", ascending=False).head(4)

# # things to store

remember('total_articles', articles.shape[0])

# total number of citations in the sm dataset
remember('sm_citations', raw_edgelist.shape[0])

# the number of articles that cite anything in the original set
remember('sm_citing', len(raw_edgelist["from"].unique()))

# the number of articles in the original dataset that have any INCOMING citations
remember('sm_cited', len(raw_edgelist["to"].unique()))

# total number of citations in the combined dataset
remember('all_citations', combo_raw_edgelist.shape[0])

# the number of articles that cite anything in the combined dataset
remember('all_citing', len(combo_raw_edgelist["from"].unique()))

# the number of articles in the combined dataset that have any INCOMING citations
remember('all_cited', len(combo_raw_edgelist["to"].unique()))
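
# Lightweight consistency check (an added sketch, not in the original script):
# the social media edgelist is one of the files concatenated into
# combo_raw_edgelist, so its counts should never exceed the combined ones.
assert r['sm_citations'] <= r['all_citations']
assert r['sm_citing'] <= r['all_citing']
assert r['sm_cited'] <= r['all_cited']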

remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])

sorted(r.keys())

# save the remembered variables to an RData file
def save_to_r(r_dict, filename="output.RData"):
    for var_name, x in r_dict.items():
        var_name = var_name.replace('_', '.')
        if type(x) == np.int64:
            x = int(x)
        if type(x) == pd.DataFrame:
            rx = pandas2ri.py2ri(x)
        else:
            rx = x

        robjects.r.assign(var_name, rx)

    # collect everything we assigned into a single named list called "r" in R
    robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
    robjects.r('save("r", file="{}")'.format(filename))
    robjects.r("rm(list=ls())")

save_to_r(r, "paper/data/network_data.RData")
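
# Optional round-trip check (an added sketch, assuming the RData file was
# written successfully): reload it through rpy2 and list the names stored in
# the "r" object; they should match the remembered keys above, with
# underscores replaced by dots.
robjects.r('load("paper/data/network_data.RData")')
print(robjects.r('names(r)'))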