code/bibliometrics/00_citation_network_analysis.py

   1 # coding: utf-8
   2 # # Import data and get things setup
   3
   4 import random
   5 random.seed(9001)
   6
   7 # import code to write r modules and create our variable we'll write to
   8 import rpy2.robjects as robjects
   9 from rpy2.robjects import pandas2ri
  10 pandas2ri.activate()
  11
  12 r = {}
  13 def remember(name, x):
  14     r[name] = x
  15
  16 # load in modules we'll need for analysis
  17 import subprocess
  18 import csv
  19 from igraph import *
  20 import pandas as pd
  21 import numpy as np
  22 import re
  23
  24 # grab the largest connected compontent with a little function
  25 def get_largest_component(g):
  26     g_components = g.components(mode="WEAK")
  27     max_size = max(g_components.sizes())
  28     for g_tmp in g_components.subgraphs():
  29         if g_tmp.vcount() == max_size:
  30             return(g_tmp)
  31
  32 # look the full edgelist into igraph
  33 def edge_list_iter(df):
  34     for i, row in df.iterrows():
  35         yield (row['from'], row['to'])
  36
  37 # list top 5 journals for each of the clusters
  38 def top_journals_for_clusters(clu):
  39     articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])
  40
  41     output = pd.DataFrame()
  42     for cid in articles_tmp['cluster'].unique():
  43         journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)
  44         tmp = pd.DataFrame({'cluster' : cid, 'count' : journal_counts })
  45         output = output.append(tmp)
  46
  47     output = output.reset_index()
  48     output = output.rename(columns = {'index' : "journal"})
  49     return(output)
  50
  51 def infomap_edgelist(g, edgelist_filename, directed=True):
  52     nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,
  53                                 'eid' : v['name']} for v in g.vs ])
  54
  55     # write out the edgelist to an external file so we can call infomap on it
  56     with open("code/bibliometrics/" + edgelist_filename + ".txt", 'w') as f:
  57         for e in g.es:
  58             if e.source != e.target:
  59                 if 'weight' in e.attributes():
  60                     print("{}\t{}\t{}".format(e.source, e.target, e['weight']), file=f)
  61                 else:
  62                     print("{}\t{}".format(e.source, e.target), file=f)
  63
  64
  65     # run the external program to generate the infomap clustering
  66     infomap_cmdline = ["code/bibliometrics/infomap/Infomap", "code/bibliometrics/" + edgelist_filename + ".txt", "code/bibliometrics/output_dir -z --map --clu --tree"]
  67     if directed:
  68         infomap_cmdline.append("-d")
  69     subprocess.call(infomap_cmdline)
  70
  71     # load up the clu data
  72     clu = pd.read_csv("code/bibliometrics/output_dir/" + edgelist_filename + ".clu",
  73                       header=None, comment="#", delim_whitespace=True)
  74     clu.columns = ['node_infomap', 'cluster', 'flow']
  75
  76     return pd.merge(clu, nodes_tmp, on="node_infomap")
  77
  78
  79 def write_graphml(g, clu, graphml_filename):
  80     clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')
  81     g.vs["cluster"] =  clu["cluster"].tolist()
  82     g.write_graphml("code/bibliometrics/" + graphml_filename)
  83
  84
  85 # load article data
  86 articles = pd.read_csv("processed_data/abstracts.tsv", delimiter="\t")
  87
  88 # # network for just the central "social media" set
  89
  90 # this contains the list of all INCOMING citations to for paper in the original set
  91 raw_edgelist = pd.read_csv("processed_data/social_media_edgelist.txt", delimiter="\t")
  92
  93 g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)
  94
  95
  96 g_sm = get_largest_component(g_sm_all)
  97 g_sm = g_sm.simplify()
  98
  99 g_sm_clu = infomap_edgelist(g_sm, "sm_edgelist_infomap", directed=True)
 100
 101 g_sm_clu['cluster'].value_counts()
 102
 103 write_graphml(g_sm, g_sm_clu, "g_sm.graphml")
 104
 105
 106 # # larger network that contains the incoming cites to citing articles
 107
 108 # this contains the list of all INCOMING citations to everything in the original set
 109 # plus every INCOMING citation to every paper that cites one of those papers
 110 raw_edgelist_files = ["processed_data/citation_edgelist.txt",
 111                       "processed_data/social_media_edgelist.txt"]
 112 combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter="\t") for x in raw_edgelist_files])
 113
 114
 115 g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)
 116
 117 g_full = get_largest_component(g_full_all)
 118 g_full = g_full.simplify()
 119
 120
 121 g_full_clu = infomap_edgelist(g_full, "citation_edglist_infomap", directed=True)
 122
 123
 124 g_full_clu['cluster'].value_counts()
 125
 126 top_journals_for_clusters(g_full_clu)
 127
 128 write_graphml(g_full, g_full_clu, "g_full.graphml")
 129
 130
 131 # # create the meta-network of connections between clusters
 132
 133 edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[["eid", "cluster"]], how="inner", left_on="to", right_on="eid")
 134 edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})
 135 edgelist_tmp.drop('eid', 1, inplace=True)
 136
 137 edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[["eid", "cluster"]], how="inner", left_on="from", right_on="eid")
 138 edgelist_tmp = edgelist_tmp.rename(columns={"cluster" : 'from_cluster'})
 139 edgelist_tmp.drop('eid', 1, inplace=True)
 140
 141 edgelist_tmp = edgelist_tmp[["to_cluster", "from_cluster"]]
 142 edgelist_tmp = edgelist_tmp[edgelist_tmp["to_cluster"] != edgelist_tmp["from_cluster"]]
 143
 144 cluster_edgelist = pd.crosstab(edgelist_tmp["to_cluster"], edgelist_tmp["from_cluster"])
 145 cluster_edgelist["to_cluster"] = cluster_edgelist.index
 146
 147 cluster_edgelist = pd.melt(cluster_edgelist, id_vars=["to_cluster"])
 148 cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]
 149
 150 remember("cluster_edgelist", cluster_edgelist)
 151
 152 top_clusters = g_sm_clu["cluster"].value_counts().head(6).index
 153
 154 # write the edgelist for the total number of clusters (currently 1-6)
 155 cluster_edgelist_output = cluster_edgelist[(cluster_edgelist["to_cluster"].isin(top_clusters)) &
 156                                            (cluster_edgelist["from_cluster"].isin(top_clusters))]
 157
 158 cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output["value"] > 0]
 159
 160 g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[["from_cluster", "to_cluster"]].values], directed=True)
 161 g_cluster.es["weight"] = cluster_edgelist_output["value"].tolist()
 162
 163 # assign the number of total articles as an attribute for each node
 164 g_cluster.vs["papers"] = g_sm_clu["cluster"].value_counts()[[x["name"] for x in g_cluster.vs]].tolist()
 165
 166 g_cluster.write_graphml("code/bibliometrics/clusters.graphml")
 167
 168 # # create network stats for tables (overall and within clusters)
 169
 170 def create_network_stats(g):
 171     network_stats = pd.DataFrame({'eid' : g.vs['name'],
 172                                   'eig_cent' : g.eigenvector_centrality(),
 173                                   'indegree' : g.indegree(),
 174                                   'betweenness' : g.betweenness()})
 175
 176     network_stats = pd.merge(network_stats,
 177                              articles[['eid', 'title', 'source_title']],
 178                              how="inner")
 179     return network_stats
 180
 181 network_stats = create_network_stats(g_full)
 182
 183 network_stats.sort_values("indegree", ascending=False).head(4)
 184
 185 network_stats.sort_values("eig_cent", ascending=False).head(4)
 186
 187 network_stats.sort_values("betweenness", ascending=False).head(4)
 188
 189 # # things to store
 190 remember('total_articles', articles.shape[0])
 191
 192 # total number of citations in the sm dataset
 193 remember('sm_citations', raw_edgelist.shape[0])
 194
 195 remember('sm_citing', len(raw_edgelist["from"].unique()))
 196
 197 # the number of articles in the original dataset that have any INCOMING citations
 198 remember('sm_cited', len(raw_edgelist["to"].unique()))
 199
 200 # total number of citations in the sm dataset
 201 remember('all_citations', combo_raw_edgelist.shape[0])
 202
 203 remember('all_citing', len(combo_raw_edgelist["from"].unique()))
 204
 205 # the number of articles in the original dataset that have any INCOMING citations
 206 remember('all_cited', len(combo_raw_edgelist["to"].unique()))
 207
 208 remember('g_sm_clusters', g_sm_clu[["eid", "cluster"]])
 209
 210 sorted(r.keys())
 211
 212 #save the r function to rdata file
 213 def save_to_r(r_dict, filename="output.RData"):
 214     for var_name, x in r.items():
 215         var_name = var_name.replace('_', '.')
 216         if type(x) == np.int64:
 217             x = np.asscalar(x)
 218
 219         if type(x) == pd.DataFrame:
 220             rx = pandas2ri.py2ri(x)
 221         else:
 222             rx = x
 223
 224         robjects.r.assign(var_name, x)
 225
 226         # create a new variable called in R
 227     robjects.r("r <- sapply(ls(), function (x) {eval(parse(text=x))})")
 228     robjects.r('save("r", file="{}")'.format(filename))
 229     robjects.r("rm(list=ls())")
 230
 231 save_to_r(r, "paper/data/network_data.RData")
 232