{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import data and get things setup"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"random.seed(9001)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3/dist-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['sin', 'pi', 'median', 'random', 'percentile', 'save', 'deprecated', 'Rectangle', 'load', 'mean', 'plot', 'cos']\n",
"`%matplotlib` prevents importing * from pylab and numpy\n",
" \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
]
}
],
"source": [
"# turn on the magic so we have inline figures\n",
"%pylab inline\n",
"import matplotlib\n",
"matplotlib.style.use('ggplot')\n",
"from IPython.display import display"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# import code to write r modules and create our variable we'll write to\n",
"import rpy2.robjects as robjects\n",
"from rpy2.robjects import pandas2ri\n",
"pandas2ri.activate()\n",
"\n",
"r = {}\n",
"def remember(name, x):\n",
" r[name] = x\n",
" display(x)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# load in modules we'll need for analysis\n",
"import subprocess\n",
"import csv\n",
"from igraph import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# grab the largest connected compontent with a little function\n",
"def get_largest_component(g):\n",
" g_components = g.components(mode=\"WEAK\")\n",
" max_size = max(g_components.sizes())\n",
" for g_tmp in g_components.subgraphs():\n",
" if g_tmp.vcount() == max_size:\n",
" return(g_tmp)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# look the full edgelist into igraph\n",
"def edge_list_iter(df):\n",
" for i, row in df.iterrows():\n",
" yield (row['from'], row['to'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"# list top 5 journals for each of the clusters\n",
"def top_journals_for_clusters(clu):\n",
" articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])\n",
" \n",
" output = pd.DataFrame()\n",
" for cid in articles_tmp['cluster'].unique():\n",
" journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)\n",
" tmp = pd.DataFrame({'cluster' : cid, 'count' : journal_counts }) \n",
" output = output.append(tmp)\n",
"\n",
" output = output.reset_index()\n",
" output = output.rename(columns = {'index' : \"journal\"})\n",
" return(output)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"def infomap_edgelist(g, edgelist_filename, directed=True):\n",
" nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index, \n",
" 'eid' : v['name']} for v in g.vs ])\n",
"\n",
" # write out the edgelist to an external file so we can call infomap on it\n",
" with open(edgelist_filename + \".txt\", 'w') as f:\n",
" for e in g.es:\n",
" if e.source != e.target:\n",
" if 'weight' in e.attributes():\n",
" print(\"{}\\t{}\\t{}\".format(e.source, e.target, e['weight']), file=f)\n",
" else:\n",
" print(\"{}\\t{}\".format(e.source, e.target), file=f)\n",
"\n",
" \n",
" # run the external program to generate the infomap clustering\n",
" infomap_cmdline = [\"infomap/Infomap\", edgelist_filename + \".txt\", \"output_dir -z --map --clu --tree\"]\n",
" if directed:\n",
" infomap_cmdline.append(\"-d\")\n",
" subprocess.call(infomap_cmdline)\n",
"\n",
" # load up the clu data\n",
" clu = pd.read_csv(\"output_dir/\" + edgelist_filename + \".clu\",\n",
" header=None, comment=\"#\", delim_whitespace=True)\n",
" clu.columns = ['node_infomap', 'cluster', 'flow']\n",
" \n",
" return pd.merge(clu, nodes_tmp, on=\"node_infomap\")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"def write_graphml(g, clu, graphml_filename):\n",
" clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')\n",
" g.vs[\"cluster\"] = clu[\"cluster\"].tolist()\n",
" g.write_graphml(graphml_filename)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# load article data\n",
"articles = pd.read_csv(\"../../processed_data/abstracts.tsv\", delimiter=\"\\t\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# network for just the central \"social media\" set"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# this contains the list of all INCOMING citations to for paper in the original set\n",
"raw_edgelist = pd.read_csv(\"../../processed_data/social_media_edgelist.txt\", delimiter=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"g_sm = get_largest_component(g_sm_all)\n",
"g_sm = g_sm.simplify()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"g_sm_clu = infomap_edgelist(g_sm, \"sm_edgelist_infomap\", directed=True)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 1817\n",
"1 1748\n",
"3 1088\n",
"4 653\n",
"6 355\n",
"10 114\n",
"5 104\n",
"9 90\n",
"8 59\n",
"7 44\n",
"12 27\n",
"11 19\n",
"13 10\n",
"14 5\n",
"15 3\n",
"16 2\n",
"18 1\n",
"17 1\n",
"Name: cluster, dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_sm_clu['cluster'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" journal | \n",
" cluster | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" 40 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 9 | \n",
" 4 | \n",
"
\n",
" \n",
" 41 | \n",
" WSDM 2013 - Proceedings of the 6th ACM Interna... | \n",
" 9 | \n",
" 4 | \n",
"
\n",
" \n",
" 42 | \n",
" Conference on Human Factors in Computing Syste... | \n",
" 9 | \n",
" 2 | \n",
"
\n",
" \n",
" 43 | \n",
" WWW 2013 Companion - Proceedings of the 22nd I... | \n",
" 9 | \n",
" 2 | \n",
"
\n",
" \n",
" 44 | \n",
" PLoS ONE | \n",
" 9 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" journal cluster count\n",
"40 Lecture Notes in Computer Science (including s... 9 4\n",
"41 WSDM 2013 - Proceedings of the 6th ACM Interna... 9 4\n",
"42 Conference on Human Factors in Computing Syste... 9 2\n",
"43 WWW 2013 Companion - Proceedings of the 22nd I... 9 2\n",
"44 PLoS ONE 9 2"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp = top_journals_for_clusters(g_sm_clu)\n",
"tmp[tmp.cluster == 9]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"write_graphml(g_sm, g_sm_clu, \"g_sm.graphml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# larger network that contains the incoming cites to citing articles"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# this contains the list of all INCOMING citations to everything in the original set\n",
"# plus every INCOMING citation to every paper that cites one of those papers\n",
"raw_edgelist_files = [\"../../processed_data/citation_edgelist.txt\",\n",
" \"../../processed_data/social_media_edgelist.txt\"]\n",
"combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter=\"\\t\") for x in raw_edgelist_files])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"g_full = get_largest_component(g_full_all)\n",
"g_full = g_full.simplify()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"g_full_clu = infomap_edgelist(g_full, \"citation_edglist_infomap\", directed=True)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 9243\n",
"2 8225\n",
"3 6826\n",
"4 3227\n",
"6 2835\n",
"5 2704\n",
"7 1911\n",
"9 810\n",
"8 803\n",
"10 589\n",
"11 520\n",
"12 491\n",
"13 336\n",
"14 219\n",
"15 175\n",
"17 162\n",
"16 153\n",
"22 139\n",
"18 135\n",
"19 118\n",
"25 117\n",
"23 106\n",
"21 93\n",
"24 88\n",
"30 84\n",
"28 79\n",
"27 78\n",
"32 76\n",
"26 73\n",
"20 71\n",
" ... \n",
"54 26\n",
"56 25\n",
"52 23\n",
"49 23\n",
"55 22\n",
"58 19\n",
"62 18\n",
"61 18\n",
"63 18\n",
"60 17\n",
"66 15\n",
"59 15\n",
"57 15\n",
"65 14\n",
"68 13\n",
"53 7\n",
"64 6\n",
"73 6\n",
"71 4\n",
"70 4\n",
"74 3\n",
"67 3\n",
"72 3\n",
"69 3\n",
"75 2\n",
"78 1\n",
"79 1\n",
"77 1\n",
"80 1\n",
"76 1\n",
"Name: cluster, Length: 80, dtype: int64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_full_clu['cluster'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" journal | \n",
" cluster | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Public Relations Review | \n",
" 1 | \n",
" 119 | \n",
"
\n",
" \n",
" 1 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 1 | \n",
" 81 | \n",
"
\n",
" \n",
" 2 | \n",
" Computers in Human Behavior | \n",
" 1 | \n",
" 71 | \n",
"
\n",
" \n",
" 3 | \n",
" Proceedings of the Annual Hawaii International... | \n",
" 1 | \n",
" 49 | \n",
"
\n",
" \n",
" 4 | \n",
" Government Information Quarterly | \n",
" 1 | \n",
" 40 | \n",
"
\n",
" \n",
" 5 | \n",
" Journal of Medical Internet Research | \n",
" 2 | \n",
" 149 | \n",
"
\n",
" \n",
" 6 | \n",
" PLoS ONE | \n",
" 2 | \n",
" 43 | \n",
"
\n",
" \n",
" 7 | \n",
" Studies in Health Technology and Informatics | \n",
" 2 | \n",
" 41 | \n",
"
\n",
" \n",
" 8 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 2 | \n",
" 32 | \n",
"
\n",
" \n",
" 9 | \n",
" Annals of Emergency Medicine | \n",
" 2 | \n",
" 17 | \n",
"
\n",
" \n",
" 10 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 3 | \n",
" 180 | \n",
"
\n",
" \n",
" 11 | \n",
" ACM International Conference Proceeding Series | \n",
" 3 | \n",
" 51 | \n",
"
\n",
" \n",
" 12 | \n",
" International Conference on Information and Kn... | \n",
" 3 | \n",
" 38 | \n",
"
\n",
" \n",
" 13 | \n",
" CEUR Workshop Proceedings | \n",
" 3 | \n",
" 37 | \n",
"
\n",
" \n",
" 14 | \n",
" PLoS ONE | \n",
" 3 | \n",
" 36 | \n",
"
\n",
" \n",
" 15 | \n",
" Information Communication and Society | \n",
" 4 | \n",
" 70 | \n",
"
\n",
" \n",
" 16 | \n",
" New Media and Society | \n",
" 4 | \n",
" 34 | \n",
"
\n",
" \n",
" 17 | \n",
" First Monday | \n",
" 4 | \n",
" 24 | \n",
"
\n",
" \n",
" 18 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 4 | \n",
" 23 | \n",
"
\n",
" \n",
" 19 | \n",
" Computers in Human Behavior | \n",
" 4 | \n",
" 21 | \n",
"
\n",
" \n",
" 20 | \n",
" Computers in Human Behavior | \n",
" 5 | \n",
" 42 | \n",
"
\n",
" \n",
" 21 | \n",
" Cyberpsychology, Behavior, and Social Networking | \n",
" 5 | \n",
" 42 | \n",
"
\n",
" \n",
" 22 | \n",
" Personality and Individual Differences | \n",
" 5 | \n",
" 11 | \n",
"
\n",
" \n",
" 23 | \n",
" Journal of Medical Internet Research | \n",
" 5 | \n",
" 11 | \n",
"
\n",
" \n",
" 24 | \n",
" Journal of Adolescent Health | \n",
" 5 | \n",
" 11 | \n",
"
\n",
" \n",
" 25 | \n",
" Computers in Human Behavior | \n",
" 6 | \n",
" 38 | \n",
"
\n",
" \n",
" 26 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 6 | \n",
" 24 | \n",
"
\n",
" \n",
" 27 | \n",
" Computers and Education | \n",
" 6 | \n",
" 16 | \n",
"
\n",
" \n",
" 28 | \n",
" Conference on Human Factors in Computing Syste... | \n",
" 6 | \n",
" 11 | \n",
"
\n",
" \n",
" 29 | \n",
" Journal of Marketing Education | \n",
" 6 | \n",
" 11 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 286 | \n",
" Medical Journal of Australia | \n",
" 63 | \n",
" 1 | \n",
"
\n",
" \n",
" 287 | \n",
" Nicotine and Tobacco Research | \n",
" 63 | \n",
" 1 | \n",
"
\n",
" \n",
" 288 | \n",
" 35th International Conference on Information S... | \n",
" 64 | \n",
" 1 | \n",
"
\n",
" \n",
" 289 | \n",
" First Monday | \n",
" 64 | \n",
" 1 | \n",
"
\n",
" \n",
" 290 | \n",
" Cyberpsychology, Behavior, and Social Networking | \n",
" 64 | \n",
" 1 | \n",
"
\n",
" \n",
" 291 | \n",
" HT'12 - Proceedings of 23rd ACM Conference on ... | \n",
" 65 | \n",
" 1 | \n",
"
\n",
" \n",
" 292 | \n",
" IEEE/ACM Transactions on Networking | \n",
" 65 | \n",
" 1 | \n",
"
\n",
" \n",
" 293 | \n",
" Journal of Healthcare Engineering | \n",
" 65 | \n",
" 1 | \n",
"
\n",
" \n",
" 294 | \n",
" International Journal of Information Management | \n",
" 66 | \n",
" 2 | \n",
"
\n",
" \n",
" 295 | \n",
" Journal of Theoretical and Applied Electronic ... | \n",
" 66 | \n",
" 1 | \n",
"
\n",
" \n",
" 296 | \n",
" Journal of Experimental and Theoretical Artifi... | \n",
" 66 | \n",
" 1 | \n",
"
\n",
" \n",
" 297 | \n",
" McKinsey Quarterly | \n",
" 66 | \n",
" 1 | \n",
"
\n",
" \n",
" 298 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 66 | \n",
" 1 | \n",
"
\n",
" \n",
" 299 | \n",
" Science (New York, N.Y.) | \n",
" 67 | \n",
" 1 | \n",
"
\n",
" \n",
" 300 | \n",
" International Conference on Information and Kn... | \n",
" 68 | \n",
" 1 | \n",
"
\n",
" \n",
" 301 | \n",
" Lecture Notes in Computer Science (including s... | \n",
" 68 | \n",
" 1 | \n",
"
\n",
" \n",
" 302 | \n",
" 16th Americas Conference on Information System... | \n",
" 68 | \n",
" 1 | \n",
"
\n",
" \n",
" 303 | \n",
" Procedia Engineering | \n",
" 68 | \n",
" 1 | \n",
"
\n",
" \n",
" 304 | \n",
" International Journal of Virtual and Personal ... | \n",
" 68 | \n",
" 1 | \n",
"
\n",
" \n",
" 305 | \n",
" Scientometrics | \n",
" 69 | \n",
" 1 | \n",
"
\n",
" \n",
" 306 | \n",
" Conference on Human Factors in Computing Syste... | \n",
" 70 | \n",
" 2 | \n",
"
\n",
" \n",
" 307 | \n",
" NyS | \n",
" 71 | \n",
" 2 | \n",
"
\n",
" \n",
" 308 | \n",
" Aslib Proceedings: New Information Perspectives | \n",
" 71 | \n",
" 1 | \n",
"
\n",
" \n",
" 309 | \n",
" WWW 2013 Companion - Proceedings of the 22nd I... | \n",
" 72 | \n",
" 1 | \n",
"
\n",
" \n",
" 310 | \n",
" Cyberpsychology, Behavior, and Social Networking | \n",
" 72 | \n",
" 1 | \n",
"
\n",
" \n",
" 311 | \n",
" PACIS 2011 - 15th Pacific Asia Conference on I... | \n",
" 73 | \n",
" 1 | \n",
"
\n",
" \n",
" 312 | \n",
" Proceedings of the International Conference on... | \n",
" 73 | \n",
" 1 | \n",
"
\n",
" \n",
" 313 | \n",
" Online (Wilton, Connecticut) | \n",
" 74 | \n",
" 1 | \n",
"
\n",
" \n",
" 314 | \n",
" Catalan Journal of Communication and Cultural ... | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
" 315 | \n",
" Proceedings - Pacific Asia Conference on Infor... | \n",
" 75 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
316 rows × 3 columns
\n",
"
"
],
"text/plain": [
" journal cluster count\n",
"0 Public Relations Review 1 119\n",
"1 Lecture Notes in Computer Science (including s... 1 81\n",
"2 Computers in Human Behavior 1 71\n",
"3 Proceedings of the Annual Hawaii International... 1 49\n",
"4 Government Information Quarterly 1 40\n",
"5 Journal of Medical Internet Research 2 149\n",
"6 PLoS ONE 2 43\n",
"7 Studies in Health Technology and Informatics 2 41\n",
"8 Lecture Notes in Computer Science (including s... 2 32\n",
"9 Annals of Emergency Medicine 2 17\n",
"10 Lecture Notes in Computer Science (including s... 3 180\n",
"11 ACM International Conference Proceeding Series 3 51\n",
"12 International Conference on Information and Kn... 3 38\n",
"13 CEUR Workshop Proceedings 3 37\n",
"14 PLoS ONE 3 36\n",
"15 Information Communication and Society 4 70\n",
"16 New Media and Society 4 34\n",
"17 First Monday 4 24\n",
"18 Lecture Notes in Computer Science (including s... 4 23\n",
"19 Computers in Human Behavior 4 21\n",
"20 Computers in Human Behavior 5 42\n",
"21 Cyberpsychology, Behavior, and Social Networking 5 42\n",
"22 Personality and Individual Differences 5 11\n",
"23 Journal of Medical Internet Research 5 11\n",
"24 Journal of Adolescent Health 5 11\n",
"25 Computers in Human Behavior 6 38\n",
"26 Lecture Notes in Computer Science (including s... 6 24\n",
"27 Computers and Education 6 16\n",
"28 Conference on Human Factors in Computing Syste... 6 11\n",
"29 Journal of Marketing Education 6 11\n",
".. ... ... ...\n",
"286 Medical Journal of Australia 63 1\n",
"287 Nicotine and Tobacco Research 63 1\n",
"288 35th International Conference on Information S... 64 1\n",
"289 First Monday 64 1\n",
"290 Cyberpsychology, Behavior, and Social Networking 64 1\n",
"291 HT'12 - Proceedings of 23rd ACM Conference on ... 65 1\n",
"292 IEEE/ACM Transactions on Networking 65 1\n",
"293 Journal of Healthcare Engineering 65 1\n",
"294 International Journal of Information Management 66 2\n",
"295 Journal of Theoretical and Applied Electronic ... 66 1\n",
"296 Journal of Experimental and Theoretical Artifi... 66 1\n",
"297 McKinsey Quarterly 66 1\n",
"298 Lecture Notes in Computer Science (including s... 66 1\n",
"299 Science (New York, N.Y.) 67 1\n",
"300 International Conference on Information and Kn... 68 1\n",
"301 Lecture Notes in Computer Science (including s... 68 1\n",
"302 16th Americas Conference on Information System... 68 1\n",
"303 Procedia Engineering 68 1\n",
"304 International Journal of Virtual and Personal ... 68 1\n",
"305 Scientometrics 69 1\n",
"306 Conference on Human Factors in Computing Syste... 70 2\n",
"307 NyS 71 2\n",
"308 Aslib Proceedings: New Information Perspectives 71 1\n",
"309 WWW 2013 Companion - Proceedings of the 22nd I... 72 1\n",
"310 Cyberpsychology, Behavior, and Social Networking 72 1\n",
"311 PACIS 2011 - 15th Pacific Asia Conference on I... 73 1\n",
"312 Proceedings of the International Conference on... 73 1\n",
"313 Online (Wilton, Connecticut) 74 1\n",
"314 Catalan Journal of Communication and Cultural ... 75 1\n",
"315 Proceedings - Pacific Asia Conference on Infor... 75 1\n",
"\n",
"[316 rows x 3 columns]"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_journals_for_clusters(g_full_clu)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"write_graphml(g_full, g_full_clu, \"g_full.graphml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# create the meta-network of connections between clusters"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" to_cluster | \n",
" from_cluster | \n",
" value | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 396 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 278 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 233 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 1 | \n",
" 171 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 1 | \n",
" 85 | \n",
"
\n",
" \n",
" 6 | \n",
" 7 | \n",
" 1 | \n",
" 57 | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 1 | \n",
" 86 | \n",
"
\n",
" \n",
" 8 | \n",
" 9 | \n",
" 1 | \n",
" 25 | \n",
"
\n",
" \n",
" 9 | \n",
" 10 | \n",
" 1 | \n",
" 29 | \n",
"
\n",
" \n",
" 10 | \n",
" 11 | \n",
" 1 | \n",
" 12 | \n",
"
\n",
" \n",
" 11 | \n",
" 12 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 12 | \n",
" 13 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" 13 | \n",
" 1 | \n",
" 2 | \n",
" 412 | \n",
"
\n",
" \n",
" 15 | \n",
" 3 | \n",
" 2 | \n",
" 117 | \n",
"
\n",
" \n",
" 16 | \n",
" 4 | \n",
" 2 | \n",
" 126 | \n",
"
\n",
" \n",
" 17 | \n",
" 5 | \n",
" 2 | \n",
" 187 | \n",
"
\n",
" \n",
" 18 | \n",
" 6 | \n",
" 2 | \n",
" 104 | \n",
"
\n",
" \n",
" 19 | \n",
" 7 | \n",
" 2 | \n",
" 175 | \n",
"
\n",
" \n",
" 20 | \n",
" 8 | \n",
" 2 | \n",
" 68 | \n",
"
\n",
" \n",
" 21 | \n",
" 9 | \n",
" 2 | \n",
" 16 | \n",
"
\n",
" \n",
" 22 | \n",
" 10 | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 23 | \n",
" 11 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" 24 | \n",
" 12 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 25 | \n",
" 13 | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" 26 | \n",
" 1 | \n",
" 3 | \n",
" 184 | \n",
"
\n",
" \n",
" 27 | \n",
" 2 | \n",
" 3 | \n",
" 150 | \n",
"
\n",
" \n",
" 29 | \n",
" 4 | \n",
" 3 | \n",
" 174 | \n",
"
\n",
" \n",
" 30 | \n",
" 5 | \n",
" 3 | \n",
" 345 | \n",
"
\n",
" \n",
" 31 | \n",
" 6 | \n",
" 3 | \n",
" 11 | \n",
"
\n",
" \n",
" 32 | \n",
" 7 | \n",
" 3 | \n",
" 99 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 204 | \n",
" 10 | \n",
" 16 | \n",
" 0 | \n",
"
\n",
" \n",
" 205 | \n",
" 11 | \n",
" 16 | \n",
" 0 | \n",
"
\n",
" \n",
" 206 | \n",
" 12 | \n",
" 16 | \n",
" 0 | \n",
"
\n",
" \n",
" 207 | \n",
" 13 | \n",
" 16 | \n",
" 1 | \n",
"
\n",
" \n",
" 208 | \n",
" 1 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 209 | \n",
" 2 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 210 | \n",
" 3 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 211 | \n",
" 4 | \n",
" 17 | \n",
" 3 | \n",
"
\n",
" \n",
" 212 | \n",
" 5 | \n",
" 17 | \n",
" 4 | \n",
"
\n",
" \n",
" 213 | \n",
" 6 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 214 | \n",
" 7 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 215 | \n",
" 8 | \n",
" 17 | \n",
" 2 | \n",
"
\n",
" \n",
" 216 | \n",
" 9 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 217 | \n",
" 10 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 218 | \n",
" 11 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 219 | \n",
" 12 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 220 | \n",
" 13 | \n",
" 17 | \n",
" 0 | \n",
"
\n",
" \n",
" 221 | \n",
" 1 | \n",
" 18 | \n",
" 3 | \n",
"
\n",
" \n",
" 222 | \n",
" 2 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 223 | \n",
" 3 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 224 | \n",
" 4 | \n",
" 18 | \n",
" 2 | \n",
"
\n",
" \n",
" 225 | \n",
" 5 | \n",
" 18 | \n",
" 2 | \n",
"
\n",
" \n",
" 226 | \n",
" 6 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 227 | \n",
" 7 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 228 | \n",
" 8 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 229 | \n",
" 9 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 230 | \n",
" 10 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 231 | \n",
" 11 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 232 | \n",
" 12 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
" 233 | \n",
" 13 | \n",
" 18 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
221 rows × 3 columns
\n",
"
"
],
"text/plain": [
" to_cluster from_cluster value\n",
"1 2 1 396\n",
"2 3 1 278\n",
"3 4 1 233\n",
"4 5 1 171\n",
"5 6 1 85\n",
"6 7 1 57\n",
"7 8 1 86\n",
"8 9 1 25\n",
"9 10 1 29\n",
"10 11 1 12\n",
"11 12 1 0\n",
"12 13 1 3\n",
"13 1 2 412\n",
"15 3 2 117\n",
"16 4 2 126\n",
"17 5 2 187\n",
"18 6 2 104\n",
"19 7 2 175\n",
"20 8 2 68\n",
"21 9 2 16\n",
"22 10 2 4\n",
"23 11 2 3\n",
"24 12 2 0\n",
"25 13 2 4\n",
"26 1 3 184\n",
"27 2 3 150\n",
"29 4 3 174\n",
"30 5 3 345\n",
"31 6 3 11\n",
"32 7 3 99\n",
".. ... ... ...\n",
"204 10 16 0\n",
"205 11 16 0\n",
"206 12 16 0\n",
"207 13 16 1\n",
"208 1 17 0\n",
"209 2 17 0\n",
"210 3 17 0\n",
"211 4 17 3\n",
"212 5 17 4\n",
"213 6 17 0\n",
"214 7 17 0\n",
"215 8 17 2\n",
"216 9 17 0\n",
"217 10 17 0\n",
"218 11 17 0\n",
"219 12 17 0\n",
"220 13 17 0\n",
"221 1 18 3\n",
"222 2 18 0\n",
"223 3 18 0\n",
"224 4 18 2\n",
"225 5 18 2\n",
"226 6 18 0\n",
"227 7 18 0\n",
"228 8 18 0\n",
"229 9 18 0\n",
"230 10 18 0\n",
"231 11 18 0\n",
"232 12 18 0\n",
"233 13 18 0\n",
"\n",
"[221 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"to\", right_on=\"eid\")\n",
"edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})\n",
"edgelist_tmp.drop('eid', 1, inplace=True)\n",
" \n",
"edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"from\", right_on=\"eid\")\n",
"edgelist_tmp = edgelist_tmp.rename(columns={\"cluster\" : 'from_cluster'})\n",
"edgelist_tmp.drop('eid', 1, inplace=True)\n",
"\n",
"edgelist_tmp = edgelist_tmp[[\"to_cluster\", \"from_cluster\"]]\n",
"edgelist_tmp = edgelist_tmp[edgelist_tmp[\"to_cluster\"] != edgelist_tmp[\"from_cluster\"]]\n",
"\n",
"cluster_edgelist = pd.crosstab(edgelist_tmp[\"to_cluster\"], edgelist_tmp[\"from_cluster\"])\n",
"cluster_edgelist[\"to_cluster\"] = cluster_edgelist.index\n",
"\n",
"cluster_edgelist = pd.melt(cluster_edgelist, id_vars=[\"to_cluster\"])\n",
"cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]\n",
"\n",
"remember(\"cluster_edgelist\", cluster_edgelist)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"top_clusters = g_sm_clu[\"cluster\"].value_counts().head(6).index\n",
"\n",
"# write the edgelist for the total number of clusters (currently 1-6)\n",
"cluster_edgelist_output = cluster_edgelist[(cluster_edgelist[\"to_cluster\"].isin(top_clusters)) &\n",
" (cluster_edgelist[\"from_cluster\"].isin(top_clusters))]\n",
"\n",
"cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output[\"value\"] > 0]\n",
"\n",
"g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[[\"from_cluster\", \"to_cluster\"]].values], directed=True)\n",
"g_cluster.es[\"weight\"] = cluster_edgelist_output[\"value\"].tolist()\n",
"\n",
"# assign the number of total articles as an attribute for each node\n",
"g_cluster.vs[\"papers\"] = g_sm_clu[\"cluster\"].value_counts()[[x[\"name\"] for x in g_cluster.vs]].tolist()\n",
"\n",
"g_cluster.write_graphml(\"clusters.graphml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# create network stats for tables (overall and within clusters)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"def create_network_stats(g):\n",
" network_stats = pd.DataFrame({'eid' : g.vs['name'],\n",
" 'eig_cent' : g.eigenvector_centrality(),\n",
" 'indegree' : g.indegree(),\n",
" 'betweenness' : g.betweenness()})\n",
"\n",
" network_stats = pd.merge(network_stats,\n",
" articles[['eid', 'title', 'source_title']],\n",
" how=\"inner\")\n",
" return network_stats"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"network_stats = create_network_stats(g_full)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" betweenness | \n",
" eid | \n",
" eig_cent | \n",
" indegree | \n",
" title | \n",
" source_title | \n",
"
\n",
" \n",
" \n",
" \n",
" 2275 | \n",
" 6393.560498 | \n",
" 2-s2.0-71149088987 | \n",
" 1.000000e+00 | \n",
" 1876 | \n",
" Users of the world, unite! The challenges and ... | \n",
" Business Horizons | \n",
"
\n",
" \n",
" 179 | \n",
" 0.000000 | \n",
" 2-s2.0-43449135033 | \n",
" 6.899762e-15 | \n",
" 645 | \n",
" Why we twitter: Understanding microblogging us... | \n",
" Joint Ninth WebKDD and First SNA-KDD 2007 Work... | \n",
"
\n",
" \n",
" 5120 | \n",
" 669.625397 | \n",
" 2-s2.0-79953711711 | \n",
" 7.271520e-02 | \n",
" 468 | \n",
" Social media? Get serious! Understanding the f... | \n",
" Business Horizons | \n",
"
\n",
" \n",
" 1855 | \n",
" 0.000000 | \n",
" 2-s2.0-67349268124 | \n",
" 2.974873e-01 | \n",
" 450 | \n",
" Social media: The new hybrid element of the pr... | \n",
" Business Horizons | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" betweenness eid eig_cent indegree \\\n",
"2275 6393.560498 2-s2.0-71149088987 1.000000e+00 1876 \n",
"179 0.000000 2-s2.0-43449135033 6.899762e-15 645 \n",
"5120 669.625397 2-s2.0-79953711711 7.271520e-02 468 \n",
"1855 0.000000 2-s2.0-67349268124 2.974873e-01 450 \n",
"\n",
" title \\\n",
"2275 Users of the world, unite! The challenges and ... \n",
"179 Why we twitter: Understanding microblogging us... \n",
"5120 Social media? Get serious! Understanding the f... \n",
"1855 Social media: The new hybrid element of the pr... \n",
"\n",
" source_title \n",
"2275 Business Horizons \n",
"179 Joint Ninth WebKDD and First SNA-KDD 2007 Work... \n",
"5120 Business Horizons \n",
"1855 Business Horizons "
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"network_stats.sort_values(\"indegree\", ascending=False).head(4)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" betweenness | \n",
" eid | \n",
" eig_cent | \n",
" indegree | \n",
" title | \n",
" source_title | \n",
"
\n",
" \n",
" \n",
" \n",
" 2275 | \n",
" 6393.560498 | \n",
" 2-s2.0-71149088987 | \n",
" 1.000000 | \n",
" 1876 | \n",
" Users of the world, unite! The challenges and ... | \n",
" Business Horizons | \n",
"
\n",
" \n",
" 2259 | \n",
" 0.000000 | \n",
" 2-s2.0-70349816888 | \n",
" 0.605279 | \n",
" 70 | \n",
" The fairyland of Second Life: Virtual social w... | \n",
" Business Horizons | \n",
"
\n",
" \n",
" 3612 | \n",
" 0.000000 | \n",
" 2-s2.0-77949522596 | \n",
" 0.563979 | \n",
" 335 | \n",
" Networked narratives: Understanding word-of-mo... | \n",
" Journal of Marketing | \n",
"
\n",
" \n",
" 7088 | \n",
" 0.000000 | \n",
" 2-s2.0-79551582037 | \n",
" 0.432951 | \n",
" 36 | \n",
" Online Personal Branding: Processes, Challenge... | \n",
" Journal of Interactive Marketing | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" betweenness eid eig_cent indegree \\\n",
"2275 6393.560498 2-s2.0-71149088987 1.000000 1876 \n",
"2259 0.000000 2-s2.0-70349816888 0.605279 70 \n",
"3612 0.000000 2-s2.0-77949522596 0.563979 335 \n",
"7088 0.000000 2-s2.0-79551582037 0.432951 36 \n",
"\n",
" title \\\n",
"2275 Users of the world, unite! The challenges and ... \n",
"2259 The fairyland of Second Life: Virtual social w... \n",
"3612 Networked narratives: Understanding word-of-mo... \n",
"7088 Online Personal Branding: Processes, Challenge... \n",
"\n",
" source_title \n",
"2275 Business Horizons \n",
"2259 Business Horizons \n",
"3612 Journal of Marketing \n",
"7088 Journal of Interactive Marketing "
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"network_stats.sort_values(\"eig_cent\", ascending=False).head(4)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" betweenness | \n",
" eid | \n",
" eig_cent | \n",
" indegree | \n",
" title | \n",
" source_title | \n",
"
\n",
" \n",
" \n",
" \n",
" 2275 | \n",
" 6393.560498 | \n",
" 2-s2.0-71149088987 | \n",
" 1.000000e+00 | \n",
" 1876 | \n",
" Users of the world, unite! The challenges and ... | \n",
" Business Horizons | \n",
"
\n",
" \n",
" 401 | \n",
" 6220.250000 | \n",
" 2-s2.0-70350491889 | \n",
" 3.749870e-16 | \n",
" 103 | \n",
" Crisis in a networked world: Features of compu... | \n",
" Social Science Computer Review | \n",
"
\n",
" \n",
" 2781 | \n",
" 5131.824639 | \n",
" 2-s2.0-84888047300 | \n",
" 1.310283e-01 | \n",
" 31 | \n",
" Social media metrics - A framework and guideli... | \n",
" Journal of Interactive Marketing | \n",
"
\n",
" \n",
" 3821 | \n",
" 4319.747561 | \n",
" 2-s2.0-84910136235 | \n",
" 3.045168e-18 | \n",
" 8 | \n",
" What are health-related users tweeting? A qual... | \n",
" Journal of Medical Internet Research | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" betweenness eid eig_cent indegree \\\n",
"2275 6393.560498 2-s2.0-71149088987 1.000000e+00 1876 \n",
"401 6220.250000 2-s2.0-70350491889 3.749870e-16 103 \n",
"2781 5131.824639 2-s2.0-84888047300 1.310283e-01 31 \n",
"3821 4319.747561 2-s2.0-84910136235 3.045168e-18 8 \n",
"\n",
" title \\\n",
"2275 Users of the world, unite! The challenges and ... \n",
"401 Crisis in a networked world: Features of compu... \n",
"2781 Social media metrics - A framework and guideli... \n",
"3821 What are health-related users tweeting? A qual... \n",
"\n",
" source_title \n",
"2275 Business Horizons \n",
"401 Social Science Computer Review \n",
"2781 Journal of Interactive Marketing \n",
"3821 Journal of Medical Internet Research "
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"network_stats.sort_values(\"betweenness\", ascending=False).head(4)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFKFJREFUeJzt3W9sW2fdxvHLifenJSVN7eKQLZNo1kqk2tYad0CgW/6YTlQIdRVEjBdoC6PJsjGyMbHhF9MkFinSiBwJGoEgRKNIaENKKEggJFO6ogRo4ixd1bAt6ZjUqFlM7NK667LO8XleVPPT0KS1XZ/45Ob7eVWf2T5X7sTXnJ9PznFZlmUJAGCskmIHAADYi6IHAMNR9ABgOIoeAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwFD0AGI6iBwDDuYu589HRUUWjUbW2tur06dN5PYfX69Xc3FyBkxWW0zM6PZ9ExkJwej7J+Rmdlq+qqiqr+xW16AOBgAKBQDEjAIDxGN0AgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC4oh5HXwiz99cVbd+lP/td0fYNANniHT0AGM6Wop+fn9fTTz+taDRqx9MDAHKQ1eimt7dXY2NjKi8vV3d3d2b7+Pi4+vv7lU6n1dTUpD179kiSDh48qM9+9rP2JAYA5CSrd/T19fUKhUKLtqXTafX19SkUCikcDmtoaEjT09N67bXXdOutt2r9+vW2BAYA5Card/S1tbWKxWKLtk1NTamyslI+n0+SVFdXp5GREc3Pz+v999/X9PS0brzxRm3fvl0lJXwUAADFkvdRN4lEQh6PJ3Pb4/FocnJS3/zmNyVJhw8f1rp165Yt+UgkokgkIknq6uqS1+vNK8dsXo8qjGwzu93uvL++leD0fBIZC8Hp+STnZ3R6vuXkXfSWZV2xzeVyZf5dX19/1ccHg0EFg8HMbSed4zlb2WZ22jms/5vT80lkLASn55Ocn9Fp+bI9H33eMxWPx6N4PJ65HY/HVVFRkdNzjI6O6qc//Wm+EQAAWci76GtqajQzM6NYLKZUKqXh4eGcLyISCATU2tqabwQAQBayGt309PRoYmJCyWRSbW1tam5uVmNjo1paWtTZ2al0Oq2GhgZVV1fntPPLLyUIALBHVkXf0dGx5Ha/3y+/35/3zrmUIADYj+MeAcBwRS16PowFAPsV9eyVjG4AwH6MbgDAcIxuAMBwjG4AwHCMbgDAcBQ9ABiOGT0AGI4ZPQAYjtENABiOogcAw1H0AGA4PowFAMPxYSwAGI7RDQAYjqIHAMNR9ABgOIoeAAzHUTcAYDiOugEAwzG6AQDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIbjOHoAMBzH0QOA4RjdAIDhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABgOIoeAAxH0QOA4Qr+l7HT09P6wx/+oGQyqTvuuEO7du0q9C4AADnIquh7e3s1Njam8vJydXd3Z7aPj4+rv79f6XRaTU1N2rNnj2699Vbt27dP6XSa89gAgANkNbqpr69XKBRatC2dTquvr0+hUEjhcFhDQ0Oanp6WdOlkZc8++6zuuOOOwicGAOQkq6Kvra1VWVnZom1TU1OqrKyUz+eT2+1WXV2dRkZGJF06Wdnzzz+vv/71r4VPDADISd4z+kQiIY/Hk7nt8Xg0OTmpEydO6B//+IdSqZS2b9++7OMjkYgikYgkqaurS16vN68cs3k9qjCyzex2u/P++laC0/NJZCwEp+eTnJ/R6fmWk3fRW5Z1xTaXy6WtW7dq69at13x8MBhUMBjM3J6bm8s3StFkm9nr9Tr663N6PomMheD0fJLzMzotX1VVVVb3y/vwSo/Ho3g8nrkdj8dVUVGR03Nw4REAsF/eRV9TU6OZmRnFYjGlUikNDw/nfBGRQCCg1tbWfCMAALKQ1eimp6dHExMTSiaTamtrU3NzsxobG9XS0qLOzk6l02k1NDSouro6p52Pjo4qGo1S9gBgo6yKvqOjY8ntfr9ffr8/751zKUEAsB+nQAAAwxW16PkwFgDsV/Bz3eSC0Q0A2I/RDQAYjtENABiO0Q0AGI7RDQAYjqIHAMMxowcAwzGjBwDDMboBAMNR9ABgOIoeAAzHh7EAYDg+jAUAwzG6AQDDUfQAYDiKHgAMR9EDgOE46gYADMdRNwBgOEY3AGA4ih4ADEfRA4DhKHoAMBxFDwCGo+gBwHAcRw8AhuM4egAwHKMbADAcRQ8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwnC1/GXv06FGNjY3p3Llzuu+++3TXXXfZsRsAQBayLvre3l6NjY2pvLxc3d3dme3j4+Pq7+9XOp1WU1OT9uzZo7vvvlt33323zp8/rwMHDlD0AFBEWY9u6uvrFQqFFm1Lp9Pq6+tTKBRSOBzW0NCQpqenM/99YGBA9913X+HSAgBylnXR19bWqqysbNG2qakpVVZWyufzye12q66uTiMjI7IsS7/61a+0bds2bdq0qeChAQDZu64ZfSKRkMfjydz2eDyanJzUH//4Rx0/flwXLlzQO++8o127dl3x2EgkokgkIknq6uqS1+vNK8NsftELItvMbrc7769vJTg9n0TGQnB6Psn5GZ2ebznXVfSWZV2xzeVyaffu3dq9e/dVHxsMBhUMBjO35+bmridKUWSb2ev1Ovrrc3o+iYyF4PR8kvMzOi1fVVVVVve7rsMrPR6P4vF45nY8HldFRUXWj+fCIwBgv+sq+pqaGs3MzCgWiymVSml4eDinC4kEAgG1trZeTwQAwDVkPbrp6enRxMSEksmk2tra1NzcrMbGRrW0tKizs1PpdFoNDQ2qrq7Oeuejo6OKRqOUPQDYKOui7+joWHK73++X3+/Pa+dcShAA7McpEADAcEUtej6MBQD72XKum2wxugEA+zG6AQDDMboBAMMxugEAwzG6AQDDUfQAYDhm9ABgOGb0AGA4RjcAYDiKHgAMR9EDgOH4MBYADMeHsQBgOEY3AGA4ih4ADEfRA4DhKHoAMBxH3QCA4TjqBgAMx+gGAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwRT3qZrVb+NaXs7rfbIH3W/qz3xX4GQGYjOPoAcBwHEcPAIZjRg8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIYr+F/Gzs7OamBgQBcuXNB3v/vdQj89ACBHWb2j7+3t1cMPP3xFcY+Pj+s73/mOvv3tb+u3v/2tJMnn8+mRRx4pfFIAQF6yKvr6+nqFQqFF29LptPr6+hQKhRQOhzU0NKTp6WlbQgIA8pdV0dfW1qqsrGzRtqmpKVVWVsrn88ntdquurk4jIyO2hAQA5C/vGX0ikZDH48nc9ng8mpycVDKZ1K9//Wu9/fbbGhwc1P3337/k4yORiCKRiCSpq6tLXq83rxyFPtf7apDvWi3H7XYX/DkLjYzXz+n5JOdndHq+5eRd9JZlXbHN5XJp3bp12rdv3zUfHwwGFQwGM7fn5ubyjfI/p9Br5fV6Hb/+ZLx+Ts8nOT+j0/JVVVVldb+8D6/0eDyKx+OZ2/F4XBUVFTk9BxceAQD75V30NTU1mpmZUSwWUyqV0vDwcM4XEQkEAmptbc03AgAgC1mNbnp6ejQxMaFkMqm2tjY1NzersbFRLS0t6uzsVDqdVkNDg6qrq3Pa+ejoqKLRKGUPADbKqug7OjqW3O73++X3+/PeOZcSBAD7cQoEADBcUYueD2MBwH4FP9dNLhjdAID9GN0AgOEY3QCA4RjdAIDhGN0AgOEoegAwHDN6ADAcM3oAMByjGwAwHEUPAIaj6AHAcHwYCwCG48NYADAcoxsAMBxFDwCGo+gBwHAUPQAYrqgfxnJx8PwsfOvLBX2+2RzuW/qz3xV03wDsx1E3AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcJy9EgAMx3H0AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC4gv9l7Pz8vH7+85/L7XZr69at2rlzZ6F3AQDIQVZF39vbq7GxMZWXl6u7uzuzfXx8XP39/Uqn02pqatKePXt09OhRfeYzn1EgEFA4HKboAaDIshrd1NfXKxQKLdqWTqfV19enUCikcDisoaEhTU9PKx6Py+v1XnryEiZDAFBsWTVxbW2tysrKFm2bmppSZWWlfD6f3G636urqNDIyIo/Ho3g8LkmyLKvwiQEAOcl7Rp9IJOTxeDK3PR6PJicn9cUvflG/+MUvNDY2pk996lPLPj4SiSgSiUiSurq6Mr8F5Go2r0chXwvf+nJR9uv+/dG8f0ZWitvtdnRGp+eTipdx9v667O5nw759g8M2POtieRf9Uu/WXS6Xbr75ZrW3t1/z8cFgUMFgMHN7bm4u3yj4H5BKpRz/M+L1eh2d0en5pNWRsdCu5+utqqrK6n55D9EvH9FIUjweV0VFRU7PwYVHAMB+eRd9TU2NZmZmFIvFlEqlNDw8nPNFRAKBgFpbW/ONAADIQlajm56eHk1MTCiZTKqtrU3Nzc1qbGxUS0uLOjs7lU6n1dDQoOrqarvzAgBylFXRd3R0LLnd7/fL7/fnvfPR0VFFo1He1QOAjbhmLAAYrqh/0cSHsQBgP97RA4DhOEcBABjOZXGeAgAw2qp/R//MM88UO8I1OT2j0/NJZCwEp+eTnJ/R6fmWs+qLHgBwdRQ9ABiu9Lnnnnuu2CGu16ZNm4od4ZqcntHp+SQyFoLT80nOz+j0fEvhw1gAMByjGwAwXFH/YOp6LXXN2pU2Nzen/fv36z//+Y9cLpeCwaB2796tl19+WX/+85/10Y9+VJL0wAMPZM4LNDg4qEOHDqmkpEQPPfSQtm3bZnvORx99VDfffLNKSkpUWlqqrq4unT9/XuFwWP/+97+1ceNGPfHEEyorK5NlWerv79err76qm266Se3t7bb+unr69GmFw+HM7VgspubmZr377rtFXcOlrpWcz5odPnxYAwMDkqS9e/eqvr7e1owHDhxQNBqV2+2Wz+dTe3u7PvKRjygWi+mJJ57InMN88+bN2rdvnyTprbfe0v79+3Xx4kVt375dDz30kFwuly358nlt2PlaXypjOBzW6dOnJUkXLlzQ2rVr9cILLxRlDQvCWqUWFhasxx57zHrnnXesDz74wHrqqaesU6dOrXiORCJhnTx50rIsy7pw4YL1+OOPW6dOnbJeeukl6+DBg1fc/9SpU9ZTTz1lXbx40ZqdnbUee+wxa2Fhwfac7e3t1tmzZxdtO3DggDU4OGhZlmUNDg5aBw4csCzLsqLRqNXZ2Wml02nrjTfesL7//e/bnu9DCwsL1sMPP2zFYrGir+GJEyeskydPWk8++WRmW65rlkwmrUcffdRKJpOL/m1nxvHxcSuVSmXyfphxdnZ20f0u98wzz1hvvPGGlU6nrc7OTmtsbMy2fLl+X+1+rS+V8XIvvvii9Zvf/MayrOKsYSGs2tHNctesXWkVFRWZd25r1qzRLbfcokQisez9R0ZGVFdXpxtuuEEf+9jHVFlZqampqZWKe0WWe++9V5J07733ZtZvdHRU99xzj1wul7Zs2aJ3331XZ86cWZFMx48fV2VlpTZu3HjV3CuxhktdKznXNRsfH9edd96psrIylZWV6c4779T4+LitGe+66y6VlpZKkrZs2XLVn0dJOnPmjN577z1t2bJFLpdL99xzT8FeS0vlW85y31e7X+tXy2hZlv72t7/pc5/73FWfw841LIRVO7pZ7pq1xRSLxfSvf/1Lt99+u15//XX96U9/0pEjR7Rp0yZ94xvfUFlZmRKJhDZv3px5zIYNG675QiyUzs5OSdIXvvAFBYNBnT17NnNVsIqKCp07d07SpbW9/LqdHo9HiUQi5yuI5WNoaGjRi8ppa5jrmv33z+lKZpWkQ4cOqa7u/6+HGovF9L3vfU9r1qzR1772NX3yk59c8rVkd8Zcv6/Feq3/85//VHl5uT7+8Y9ntjllDXOxaoveWuaatcUyPz+v7u5uPfjgg1q7dq127dqlr3zlK5Kkl156Sb/85S/V3t6+ZO6V8IMf/EAbNmzQ2bNn9fzzz1/1WpPFWttUKqVoNKqvf/3rkuS4NbyaXNZspX5OBwYGVFpaqp07d0q69D+m3t5erVu3Tm+99ZZeeOEFdXd3r/h65vp9LeZr/b/feDhlDXO1akc3hbhmbaGkUil1d3dr586d+vSnPy1JWr9+vUpKSlRSUqKmpiadPHlyydyJREIbNmywPeOH+ygvL9eOHTs0NTWl8vLyzEjmzJkzmQ/HPB7PogsWr9Tavvrqq/rEJz6h9evXS3LeGkrKec02bNhwRdaVWMvDhw8rGo3q8ccfz5TiDTfcoHXr1km6dCy4z+fTzMzMkq8lO9cz1+9rsV7rCwsLOnr06KLfiJyyhrlatUVfiGvWFoJlWfrJT36iW265RV/60pcy2y+faR89ejRzmcVAIKDh4WF98MEHisVimpmZ0e23325rxvn5eb333nuZf7/22mu67bbbFAgE9Morr0iSXnnlFe3YsSOT8ciRI7IsS2+++abWrl1blLGNk9bwQ7mu2bZt23Ts2DGdP39e58+f17Fjx2w/ymp8fFwHDx7U008/rZtuuimz/dy5c0qn05Kk2dlZzczMyOfzqaKiQmvWrNGbb74py7J05MgRW19LuX5fi/VaP378uKqqqhaNZJyyhrla1X8wNTY2phdffDFzzdq9e/eueIbXX39dzz77rG677bbMO6cHHnhAQ0NDevvtt+VyubRx40bt27cvU5YDAwP6y1/+opKSEj344IPavn27rRlnZ2f1wx/+UNKldymf//zntXfvXiWTSYXDYc3Nzcnr9erJJ5/MHCrY19enY8eO6cYbb1R7e7tqampszfj+++/rkUce0Y9//GOtXbtWkvSjH/2oqGt4+bWSy8vL1dzcrB07duS8ZocOHdLg4KCkS4dXNjQ02JpxcHBQqVQq8wHjh4cA/v3vf9fLL7+s0tJSlZSU6Ktf/WqmjE6ePKne3l5dvHhR27ZtU0tLS0HGI0vlO3HiRM7fVztf60tlbGxs1P79+7V582bt2rUrc99irGEhrOqiBwBc26od3QAAskPRA4DhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABguP8DaoV4MSni/p8AAAAASUVORK5CYII=\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"network_stats['indegree'].hist(log = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# things to store"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23131"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"remember('total_articles', articles.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"35620"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"4807"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"3864"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# total number of citations in the sm dataset\n",
"remember('sm_citations', raw_edgelist.shape[0])\n",
"\n",
"remember('sm_citing', len(raw_edgelist[\"from\"].unique()))\n",
"\n",
"# the number of articles in the original dataset that have any INCOMING citations\n",
"remember('sm_cited', len(raw_edgelist[\"to\"].unique()))"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"212773"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"42935"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"9710"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# total number of citations in the sm dataset\n",
"remember('all_citations', combo_raw_edgelist.shape[0])\n",
"\n",
"remember('all_citing', len(combo_raw_edgelist[\"from\"].unique()))\n",
"\n",
"# the number of articles in the original dataset that have any INCOMING citations\n",
"remember('all_cited', len(combo_raw_edgelist[\"to\"].unique()))"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" eid | \n",
" cluster | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2-s2.0-71149088987 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2-s2.0-70349816888 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 2-s2.0-79953711711 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 2-s2.0-79551630751 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 2-s2.0-80051469103 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 2-s2.0-84866718851 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" 2-s2.0-84877685551 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 2-s2.0-84864442547 | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" 2-s2.0-84861420864 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" 2-s2.0-84887483487 | \n",
" 1 | \n",
"
\n",
" \n",
" 10 | \n",
" 2-s2.0-80955144847 | \n",
" 1 | \n",
"
\n",
" \n",
" 11 | \n",
" 2-s2.0-84885038309 | \n",
" 1 | \n",
"
\n",
" \n",
" 12 | \n",
" 2-s2.0-84886099569 | \n",
" 1 | \n",
"
\n",
" \n",
" 13 | \n",
" 2-s2.0-84863379783 | \n",
" 1 | \n",
"
\n",
" \n",
" 14 | \n",
" 2-s2.0-84899093663 | \n",
" 1 | \n",
"
\n",
" \n",
" 15 | \n",
" 2-s2.0-84879109859 | \n",
" 1 | \n",
"
\n",
" \n",
" 16 | \n",
" 2-s2.0-83055168309 | \n",
" 1 | \n",
"
\n",
" \n",
" 17 | \n",
" 2-s2.0-84876304322 | \n",
" 1 | \n",
"
\n",
" \n",
" 18 | \n",
" 2-s2.0-84866168147 | \n",
" 1 | \n",
"
\n",
" \n",
" 19 | \n",
" 2-s2.0-84877817428 | \n",
" 1 | \n",
"
\n",
" \n",
" 20 | \n",
" 2-s2.0-84873481256 | \n",
" 1 | \n",
"
\n",
" \n",
" 21 | \n",
" 2-s2.0-84861794897 | \n",
" 1 | \n",
"
\n",
" \n",
" 22 | \n",
" 2-s2.0-84899508298 | \n",
" 1 | \n",
"
\n",
" \n",
" 23 | \n",
" 2-s2.0-84898082465 | \n",
" 1 | \n",
"
\n",
" \n",
" 24 | \n",
" 2-s2.0-84879021774 | \n",
" 1 | \n",
"
\n",
" \n",
" 25 | \n",
" 2-s2.0-80054988041 | \n",
" 1 | \n",
"
\n",
" \n",
" 26 | \n",
" 2-s2.0-84944394118 | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" 2-s2.0-84870572301 | \n",
" 1 | \n",
"
\n",
" \n",
" 28 | \n",
" 2-s2.0-84907167320 | \n",
" 1 | \n",
"
\n",
" \n",
" 29 | \n",
" 2-s2.0-84914675721 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 6110 | \n",
" 2-s2.0-84856086839 | \n",
" 12 | \n",
"
\n",
" \n",
" 6111 | \n",
" 2-s2.0-84859510122 | \n",
" 12 | \n",
"
\n",
" \n",
" 6112 | \n",
" 2-s2.0-84905121209 | \n",
" 12 | \n",
"
\n",
" \n",
" 6113 | \n",
" 2-s2.0-84883758613 | \n",
" 12 | \n",
"
\n",
" \n",
" 6114 | \n",
" 2-s2.0-84877953100 | \n",
" 12 | \n",
"
\n",
" \n",
" 6115 | \n",
" 2-s2.0-84904376766 | \n",
" 12 | \n",
"
\n",
" \n",
" 6116 | \n",
" 2-s2.0-84905837182 | \n",
" 12 | \n",
"
\n",
" \n",
" 6117 | \n",
" 2-s2.0-84900461218 | \n",
" 12 | \n",
"
\n",
" \n",
" 6118 | \n",
" 2-s2.0-83755228785 | \n",
" 13 | \n",
"
\n",
" \n",
" 6119 | \n",
" 2-s2.0-84886795975 | \n",
" 13 | \n",
"
\n",
" \n",
" 6120 | \n",
" 2-s2.0-84876132785 | \n",
" 13 | \n",
"
\n",
" \n",
" 6121 | \n",
" 2-s2.0-84903121334 | \n",
" 13 | \n",
"
\n",
" \n",
" 6122 | \n",
" 2-s2.0-84863720400 | \n",
" 13 | \n",
"
\n",
" \n",
" 6123 | \n",
" 2-s2.0-84873180938 | \n",
" 13 | \n",
"
\n",
" \n",
" 6124 | \n",
" 2-s2.0-84914112838 | \n",
" 13 | \n",
"
\n",
" \n",
" 6125 | \n",
" 2-s2.0-84878795748 | \n",
" 13 | \n",
"
\n",
" \n",
" 6126 | \n",
" 2-s2.0-84888011666 | \n",
" 13 | \n",
"
\n",
" \n",
" 6127 | \n",
" 2-s2.0-84942101218 | \n",
" 13 | \n",
"
\n",
" \n",
" 6128 | \n",
" 2-s2.0-80052752113 | \n",
" 14 | \n",
"
\n",
" \n",
" 6129 | \n",
" 2-s2.0-84874074707 | \n",
" 14 | \n",
"
\n",
" \n",
" 6130 | \n",
" 2-s2.0-84942582235 | \n",
" 14 | \n",
"
\n",
" \n",
" 6131 | \n",
" 2-s2.0-70849130360 | \n",
" 14 | \n",
"
\n",
" \n",
" 6132 | \n",
" 2-s2.0-84864152630 | \n",
" 14 | \n",
"
\n",
" \n",
" 6133 | \n",
" 2-s2.0-84868709161 | \n",
" 15 | \n",
"
\n",
" \n",
" 6134 | \n",
" 2-s2.0-84896350015 | \n",
" 15 | \n",
"
\n",
" \n",
" 6135 | \n",
" 2-s2.0-84944104933 | \n",
" 15 | \n",
"
\n",
" \n",
" 6136 | \n",
" 2-s2.0-84875539506 | \n",
" 16 | \n",
"
\n",
" \n",
" 6137 | \n",
" 2-s2.0-84902262954 | \n",
" 16 | \n",
"
\n",
" \n",
" 6138 | \n",
" 2-s2.0-84909954481 | \n",
" 17 | \n",
"
\n",
" \n",
" 6139 | \n",
" 2-s2.0-84921469678 | \n",
" 18 | \n",
"
\n",
" \n",
"
\n",
"
6140 rows × 2 columns
\n",
"
"
],
"text/plain": [
" eid cluster\n",
"0 2-s2.0-71149088987 1\n",
"1 2-s2.0-70349816888 1\n",
"2 2-s2.0-79953711711 1\n",
"3 2-s2.0-79551630751 1\n",
"4 2-s2.0-80051469103 1\n",
"5 2-s2.0-84866718851 1\n",
"6 2-s2.0-84877685551 1\n",
"7 2-s2.0-84864442547 1\n",
"8 2-s2.0-84861420864 1\n",
"9 2-s2.0-84887483487 1\n",
"10 2-s2.0-80955144847 1\n",
"11 2-s2.0-84885038309 1\n",
"12 2-s2.0-84886099569 1\n",
"13 2-s2.0-84863379783 1\n",
"14 2-s2.0-84899093663 1\n",
"15 2-s2.0-84879109859 1\n",
"16 2-s2.0-83055168309 1\n",
"17 2-s2.0-84876304322 1\n",
"18 2-s2.0-84866168147 1\n",
"19 2-s2.0-84877817428 1\n",
"20 2-s2.0-84873481256 1\n",
"21 2-s2.0-84861794897 1\n",
"22 2-s2.0-84899508298 1\n",
"23 2-s2.0-84898082465 1\n",
"24 2-s2.0-84879021774 1\n",
"25 2-s2.0-80054988041 1\n",
"26 2-s2.0-84944394118 1\n",
"27 2-s2.0-84870572301 1\n",
"28 2-s2.0-84907167320 1\n",
"29 2-s2.0-84914675721 1\n",
"... ... ...\n",
"6110 2-s2.0-84856086839 12\n",
"6111 2-s2.0-84859510122 12\n",
"6112 2-s2.0-84905121209 12\n",
"6113 2-s2.0-84883758613 12\n",
"6114 2-s2.0-84877953100 12\n",
"6115 2-s2.0-84904376766 12\n",
"6116 2-s2.0-84905837182 12\n",
"6117 2-s2.0-84900461218 12\n",
"6118 2-s2.0-83755228785 13\n",
"6119 2-s2.0-84886795975 13\n",
"6120 2-s2.0-84876132785 13\n",
"6121 2-s2.0-84903121334 13\n",
"6122 2-s2.0-84863720400 13\n",
"6123 2-s2.0-84873180938 13\n",
"6124 2-s2.0-84914112838 13\n",
"6125 2-s2.0-84878795748 13\n",
"6126 2-s2.0-84888011666 13\n",
"6127 2-s2.0-84942101218 13\n",
"6128 2-s2.0-80052752113 14\n",
"6129 2-s2.0-84874074707 14\n",
"6130 2-s2.0-84942582235 14\n",
"6131 2-s2.0-70849130360 14\n",
"6132 2-s2.0-84864152630 14\n",
"6133 2-s2.0-84868709161 15\n",
"6134 2-s2.0-84896350015 15\n",
"6135 2-s2.0-84944104933 15\n",
"6136 2-s2.0-84875539506 16\n",
"6137 2-s2.0-84902262954 16\n",
"6138 2-s2.0-84909954481 17\n",
"6139 2-s2.0-84921469678 18\n",
"\n",
"[6140 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"remember('g_sm_clusters', g_sm_clu[[\"eid\", \"cluster\"]])"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['all_citations',\n",
" 'all_cited',\n",
" 'all_citing',\n",
" 'cluster_edgelist',\n",
" 'g_sm_clusters',\n",
" 'sm_citations',\n",
" 'sm_cited',\n",
" 'sm_citing',\n",
" 'total_articles']"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(r.keys())"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"#save the r function to rdata file\n",
"def save_to_r(r_dict, filename=\"output.RData\"):\n",
" for var_name, x in r.items():\n",
" var_name = var_name.replace('_', '.')\n",
" if type(x) == np.int64:\n",
" x = np.asscalar(x)\n",
" \n",
" if type(x) == pd.DataFrame:\n",
" rx = pandas2ri.py2ri(x)\n",
" else:\n",
" rx = x\n",
" \n",
" robjects.r.assign(var_name, x)\n",
"\n",
" # create a new variable called in R\n",
" robjects.r(\"r <- sapply(ls(), function (x) {eval(parse(text=x))})\")\n",
" robjects.r('save(\"r\", file=\"{}\")'.format(filename))\n",
" robjects.r(\"rm(list=ls())\")\n",
" \n",
"save_to_r(r, \"../../paper/data/network_data.RData\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}