{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import data and get things setup" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import random\n", "random.seed(9001)" ] },
{ "cell_type": "code", "execution_count": 53, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/lib/python3/dist-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['sin', 'pi', 'median', 'random', 'percentile', 'save', 'deprecated', 'Rectangle', 'load', 'mean', 'plot', 'cos']\n", "`%matplotlib` prevents importing * from pylab and numpy\n", " \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n" ] } ], "source": [ "# turn on the magic so we have inline figures\n", "%pylab inline\n", "import matplotlib\n", "matplotlib.style.use('ggplot')\n", "from IPython.display import display" ] },
{ "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [
 "# import code to write r modules and create our variable we'll write to\n",
 "import rpy2.robjects as robjects\n",
 "from rpy2.robjects import pandas2ri\n",
 "pandas2ri.activate()\n",
 "\n",
 "r = {}\n",
 "def remember(name, x):\n",
 "    \"\"\"Store ``x`` in the ``r`` dict under ``name`` and display it.\"\"\"\n",
 "    r[name] = x\n",
 "    display(x)" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [
 "# load in modules we'll need for analysis\n",
 "import os\n",
 "import subprocess\n",
 "import csv\n",
 "from igraph import *\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "import re" ] },
{ "cell_type": "code", "execution_count": 56, "metadata": { "scrolled": true }, "outputs": [], "source": [
 "# grab the largest connected component with a little function\n",
 "def get_largest_component(g):\n",
 "    \"\"\"Return a copy of the largest weakly connected component of ``g``.\n",
 "\n",
 "    When several components share the maximum size, the first one wins.\n",
 "    \"\"\"\n",
 "    # giant() picks the first maximum-size cluster without materialising\n",
 "    # a subgraph for every component\n",
 "    return g.components(mode=\"WEAK\").giant()" ] },
{ "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [
 "# load the full edgelist into igraph\n",
 "def edge_list_iter(df):\n",
 "    \"\"\"Yield one ``(from, to)`` tuple per row of the edgelist frame ``df``.\n",
 "\n",
 "    ``df`` must have the columns ``from`` and ``to``.\n",
 "    \"\"\"\n",
 "    # zip the two columns directly; iterrows() builds a Series for every row\n",
 "    yield from zip(df['from'], df['to'])" ] },
{ "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [
 "# list top 5 journals for each of the clusters\n",
 "def top_journals_for_clusters(clu):\n",
 "    \"\"\"Tabulate the five most frequent journals within each cluster.\n",
 "\n",
 "    ``clu`` is merged with the notebook-level ``articles`` table on the\n",
 "    columns they share to look up each article's ``source_title``.\n",
 "    Returns a frame with the columns ``journal``, ``cluster`` and ``count``.\n",
 "    \"\"\"\n",
 "    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])\n",
 "\n",
 "    # build every per-cluster table first and concatenate once:\n",
 "    # DataFrame.append was removed in pandas 2.0 and re-copied the frame on each call\n",
 "    per_cluster = []\n",
 "    for cid in articles_tmp['cluster'].unique():\n",
 "        journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)\n",
 "        per_cluster.append(pd.DataFrame({'cluster' : cid, 'count' : journal_counts}))\n",
 "\n",
 "    if not per_cluster:\n",
 "        return pd.DataFrame(columns=['journal', 'cluster', 'count'])\n",
 "\n",
 "    # name the index explicitly so the column is called \"journal\" whatever\n",
 "    # label value_counts() gives its index\n",
 "    return pd.concat(per_cluster).rename_axis('journal').reset_index()" ] },
{ "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [
 "def infomap_edgelist(g, edgelist_filename, directed=True,\n",
 "                     infomap_bin=\"infomap/Infomap\", output_dir=\"output_dir\"):\n",
 "    \"\"\"Cluster ``g`` with the external Infomap program.\n",
 "\n",
 "    Writes ``<edgelist_filename>.txt`` with one tab-separated line per edge\n",
 "    (source index, target index and, if the graph has a ``weight`` edge\n",
 "    attribute, the weight; self-loops are skipped), runs ``infomap_bin`` on\n",
 "    it and reads the resulting ``.clu`` file from ``output_dir``.\n",
 "\n",
 "    Returns a data frame with the columns ``node_infomap`` (igraph vertex\n",
 "    index), ``cluster``, ``flow`` and ``eid`` (the vertex ``name``).\n",
 "\n",
 "    Raises ``subprocess.CalledProcessError`` if Infomap exits with a\n",
 "    non-zero status.\n",
 "    \"\"\"\n",
 "    nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index,\n",
 "                                'eid' : v['name']} for v in g.vs ])\n",
 "\n",
 "    # write out the edgelist to an external file so we can call infomap on it\n",
 "    edgelist_path = edgelist_filename + \".txt\"\n",
 "    has_weight = 'weight' in g.es.attributes()\n",
 "    with open(edgelist_path, 'w') as f:\n",
 "        for e in g.es:\n",
 "            if e.source == e.target:\n",
 "                continue\n",
 "            if has_weight:\n",
 "                print(\"{}\\t{}\\t{}\".format(e.source, e.target, e['weight']), file=f)\n",
 "            else:\n",
 "                print(\"{}\\t{}\".format(e.source, e.target), file=f)\n",
 "\n",
 "    # Infomap does not create its output directory itself\n",
 "    os.makedirs(output_dir, exist_ok=True)\n",
 "\n",
 "    # run the external program to generate the infomap clustering;\n",
 "    # every option is its own argv element, and check_call stops us from\n",
 "    # silently reading a stale .clu file after a failed run\n",
 "    infomap_cmdline = [infomap_bin, edgelist_path, output_dir, \"-z\", \"--map\", \"--clu\", \"--tree\"]\n",
 "    if directed:\n",
 "        infomap_cmdline.append(\"-d\")\n",
 "    subprocess.check_call(infomap_cmdline)\n",
 "\n",
 "    # load up the clu data\n",
 "    clu_path = os.path.join(output_dir, os.path.basename(edgelist_filename) + \".clu\")\n",
 "    clu = pd.read_csv(clu_path, header=None, comment=\"#\", sep=r\"\\s+\")\n",
 "    clu.columns = ['node_infomap', 'cluster', 'flow']\n",
 "\n",
 "    return pd.merge(clu, nodes_tmp, on=\"node_infomap\")" ] },
{ "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [
 "def write_graphml(g, clu, graphml_filename):\n",
 "    \"\"\"Label every vertex of ``g`` with its cluster and save ``g`` as GraphML.\n",
 "\n",
 "    ``clu`` needs the columns ``node_infomap`` (igraph vertex index) and\n",
 "    ``cluster``. ``g`` is modified in place: it gains a ``cluster`` vertex\n",
 "    attribute.\n",
 "\n",
 "    Raises ``ValueError`` if any vertex of ``g`` has no row in ``clu``.\n",
 "    \"\"\"\n",
 "    # look clusters up by vertex index instead of trusting that the sorted\n",
 "    # rows of clu line up one-to-one with g.vs\n",
 "    cluster_by_node = dict(zip(clu['node_infomap'].tolist(), clu['cluster'].tolist()))\n",
 "    missing = [v.index for v in g.vs if v.index not in cluster_by_node]\n",
 "    if missing:\n",
 "        raise ValueError(\"{} vertices have no cluster assignment (first: {})\".format(len(missing), missing[0]))\n",
 "    g.vs[\"cluster\"] = [cluster_by_node[v.index] for v in g.vs]\n",
 "    g.write_graphml(graphml_filename)" ] },
{ "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "# load article data\n", "articles = pd.read_csv(\"../../processed_data/abstracts.tsv\", delimiter=\"\\t\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# network for just the central \"social media\" set" ] },
{ "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "# this contains the list of all INCOMING citations to each paper in the original set\n", "raw_edgelist = pd.read_csv(\"../../processed_data/social_media_edgelist.txt\", delimiter=\"\\t\")" ] },
{ "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)" ] },
{ "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "g_sm = get_largest_component(g_sm_all)\n", "g_sm = g_sm.simplify()" ] },
{ "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "g_sm_clu = infomap_edgelist(g_sm, \"sm_edgelist_infomap\", directed=True)" ] },
{ "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2 1817\n", "1 1748\n", "3 1088\n", "4 653\n", "6 355\n", "10 114\n", "5 104\n", "9 90\n", "8 59\n", "7 44\n", "12 27\n", "11 19\n", "13 10\n", "14 5\n", "15 3\n", "16 2\n", "18 1\n", "17 1\n", "Name: cluster, 
dtype: int64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g_sm_clu['cluster'].value_counts()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
journalclustercount
40Lecture Notes in Computer Science (including s...94
41WSDM 2013 - Proceedings of the 6th ACM Interna...94
42Conference on Human Factors in Computing Syste...92
43WWW 2013 Companion - Proceedings of the 22nd I...92
44PLoS ONE92
\n", "
" ], "text/plain": [ " journal cluster count\n", "40 Lecture Notes in Computer Science (including s... 9 4\n", "41 WSDM 2013 - Proceedings of the 6th ACM Interna... 9 4\n", "42 Conference on Human Factors in Computing Syste... 9 2\n", "43 WWW 2013 Companion - Proceedings of the 22nd I... 9 2\n", "44 PLoS ONE 9 2" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# top journals per cluster; only the rows for cluster 9 are shown\n",
 "tmp = top_journals_for_clusters(g_sm_clu)\n",
 "tmp.loc[tmp[\"cluster\"] == 9]" ] },
{ "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "write_graphml(g_sm, g_sm_clu, \"g_sm.graphml\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# larger network that contains the incoming cites to citing articles" ] },
{ "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [
 "# combined edgelist: every INCOMING citation to the original set, plus every\n",
 "# INCOMING citation to each paper that cites something in that set\n",
 "raw_edgelist_files = [\n",
 "    \"../../processed_data/citation_edgelist.txt\",\n",
 "    \"../../processed_data/social_media_edgelist.txt\",\n",
 "]\n",
 "combo_raw_edgelist = pd.concat([pd.read_csv(path, delimiter=\"\\t\") for path in raw_edgelist_files])" ] },
{ "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [
 "# directed graph with one edge per row of the combined edgelist\n",
 "g_full_all = Graph.TupleList(list(edge_list_iter(combo_raw_edgelist)), directed=True)" ] },
{ "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [
 "# largest weak component, simplified\n",
 "g_full = get_largest_component(g_full_all).simplify()" ] },
{ "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "g_full_clu = infomap_edgelist(g_full, \"citation_edglist_infomap\", directed=True)" ] },
{ "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 9243\n", "2 8225\n", "3 6826\n", "4 3227\n", "6 2835\n", "5 2704\n", "7 1911\n", "9 810\n", "8 
803\n", "10 589\n", "11 520\n", "12 491\n", "13 336\n", "14 219\n", "15 175\n", "17 162\n", "16 153\n", "22 139\n", "18 135\n", "19 118\n", "25 117\n", "23 106\n", "21 93\n", "24 88\n", "30 84\n", "28 79\n", "27 78\n", "32 76\n", "26 73\n", "20 71\n", " ... \n", "54 26\n", "56 25\n", "52 23\n", "49 23\n", "55 22\n", "58 19\n", "62 18\n", "61 18\n", "63 18\n", "60 17\n", "66 15\n", "59 15\n", "57 15\n", "65 14\n", "68 13\n", "53 7\n", "64 6\n", "73 6\n", "71 4\n", "70 4\n", "74 3\n", "67 3\n", "72 3\n", "69 3\n", "75 2\n", "78 1\n", "79 1\n", "77 1\n", "80 1\n", "76 1\n", "Name: cluster, Length: 80, dtype: int64" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g_full_clu['cluster'].value_counts()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
journalclustercount
0Public Relations Review1119
1Lecture Notes in Computer Science (including s...181
2Computers in Human Behavior171
3Proceedings of the Annual Hawaii International...149
4Government Information Quarterly140
5Journal of Medical Internet Research2149
6PLoS ONE243
7Studies in Health Technology and Informatics241
8Lecture Notes in Computer Science (including s...232
9Annals of Emergency Medicine217
10Lecture Notes in Computer Science (including s...3180
11ACM International Conference Proceeding Series351
12International Conference on Information and Kn...338
13CEUR Workshop Proceedings337
14PLoS ONE336
15Information Communication and Society470
16New Media and Society434
17First Monday424
18Lecture Notes in Computer Science (including s...423
19Computers in Human Behavior421
20Computers in Human Behavior542
21Cyberpsychology, Behavior, and Social Networking542
22Personality and Individual Differences511
23Journal of Medical Internet Research511
24Journal of Adolescent Health511
25Computers in Human Behavior638
26Lecture Notes in Computer Science (including s...624
27Computers and Education616
28Conference on Human Factors in Computing Syste...611
29Journal of Marketing Education611
............
286Medical Journal of Australia631
287Nicotine and Tobacco Research631
28835th International Conference on Information S...641
289First Monday641
290Cyberpsychology, Behavior, and Social Networking641
291HT'12 - Proceedings of 23rd ACM Conference on ...651
292IEEE/ACM Transactions on Networking651
293Journal of Healthcare Engineering651
294International Journal of Information Management662
295Journal of Theoretical and Applied Electronic ...661
296Journal of Experimental and Theoretical Artifi...661
297McKinsey Quarterly661
298Lecture Notes in Computer Science (including s...661
299Science (New York, N.Y.)671
300International Conference on Information and Kn...681
301Lecture Notes in Computer Science (including s...681
30216th Americas Conference on Information System...681
303Procedia Engineering681
304International Journal of Virtual and Personal ...681
305Scientometrics691
306Conference on Human Factors in Computing Syste...702
307NyS712
308Aslib Proceedings: New Information Perspectives711
309WWW 2013 Companion - Proceedings of the 22nd I...721
310Cyberpsychology, Behavior, and Social Networking721
311PACIS 2011 - 15th Pacific Asia Conference on I...731
312Proceedings of the International Conference on...731
313Online (Wilton, Connecticut)741
314Catalan Journal of Communication and Cultural ...751
315Proceedings - Pacific Asia Conference on Infor...751
\n", "

316 rows × 3 columns

\n", "
" ], "text/plain": [ " journal cluster count\n", "0 Public Relations Review 1 119\n", "1 Lecture Notes in Computer Science (including s... 1 81\n", "2 Computers in Human Behavior 1 71\n", "3 Proceedings of the Annual Hawaii International... 1 49\n", "4 Government Information Quarterly 1 40\n", "5 Journal of Medical Internet Research 2 149\n", "6 PLoS ONE 2 43\n", "7 Studies in Health Technology and Informatics 2 41\n", "8 Lecture Notes in Computer Science (including s... 2 32\n", "9 Annals of Emergency Medicine 2 17\n", "10 Lecture Notes in Computer Science (including s... 3 180\n", "11 ACM International Conference Proceeding Series 3 51\n", "12 International Conference on Information and Kn... 3 38\n", "13 CEUR Workshop Proceedings 3 37\n", "14 PLoS ONE 3 36\n", "15 Information Communication and Society 4 70\n", "16 New Media and Society 4 34\n", "17 First Monday 4 24\n", "18 Lecture Notes in Computer Science (including s... 4 23\n", "19 Computers in Human Behavior 4 21\n", "20 Computers in Human Behavior 5 42\n", "21 Cyberpsychology, Behavior, and Social Networking 5 42\n", "22 Personality and Individual Differences 5 11\n", "23 Journal of Medical Internet Research 5 11\n", "24 Journal of Adolescent Health 5 11\n", "25 Computers in Human Behavior 6 38\n", "26 Lecture Notes in Computer Science (including s... 6 24\n", "27 Computers and Education 6 16\n", "28 Conference on Human Factors in Computing Syste... 6 11\n", "29 Journal of Marketing Education 6 11\n", ".. ... ... ...\n", "286 Medical Journal of Australia 63 1\n", "287 Nicotine and Tobacco Research 63 1\n", "288 35th International Conference on Information S... 64 1\n", "289 First Monday 64 1\n", "290 Cyberpsychology, Behavior, and Social Networking 64 1\n", "291 HT'12 - Proceedings of 23rd ACM Conference on ... 
65 1\n", "292 IEEE/ACM Transactions on Networking 65 1\n", "293 Journal of Healthcare Engineering 65 1\n", "294 International Journal of Information Management 66 2\n", "295 Journal of Theoretical and Applied Electronic ... 66 1\n", "296 Journal of Experimental and Theoretical Artifi... 66 1\n", "297 McKinsey Quarterly 66 1\n", "298 Lecture Notes in Computer Science (including s... 66 1\n", "299 Science (New York, N.Y.) 67 1\n", "300 International Conference on Information and Kn... 68 1\n", "301 Lecture Notes in Computer Science (including s... 68 1\n", "302 16th Americas Conference on Information System... 68 1\n", "303 Procedia Engineering 68 1\n", "304 International Journal of Virtual and Personal ... 68 1\n", "305 Scientometrics 69 1\n", "306 Conference on Human Factors in Computing Syste... 70 2\n", "307 NyS 71 2\n", "308 Aslib Proceedings: New Information Perspectives 71 1\n", "309 WWW 2013 Companion - Proceedings of the 22nd I... 72 1\n", "310 Cyberpsychology, Behavior, and Social Networking 72 1\n", "311 PACIS 2011 - 15th Pacific Asia Conference on I... 73 1\n", "312 Proceedings of the International Conference on... 73 1\n", "313 Online (Wilton, Connecticut) 74 1\n", "314 Catalan Journal of Communication and Cultural ... 75 1\n", "315 Proceedings - Pacific Asia Conference on Infor... 75 1\n", "\n", "[316 rows x 3 columns]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_journals_for_clusters(g_full_clu)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "write_graphml(g_full, g_full_clu, \"g_full.graphml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# create the meta-network of connections between clusters" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
to_clusterfrom_clustervalue
121396
231278
341233
451171
56185
67157
78186
89125
910129
1011112
111210
121313
1312412
1532117
1642126
1752187
1862104
1972175
208268
219216
221024
231123
241220
251324
2613184
2723150
2943174
3053345
316311
327399
............
20410160
20511160
20612160
20713161
2081170
2092170
2103170
2114173
2125174
2136170
2147170
2158172
2169170
21710170
21811170
21912170
22013170
2211183
2222180
2233180
2244182
2255182
2266180
2277180
2288180
2299180
23010180
23111180
23212180
23313180
\n", "

221 rows × 3 columns

\n", "
" ], "text/plain": [ " to_cluster from_cluster value\n", "1 2 1 396\n", "2 3 1 278\n", "3 4 1 233\n", "4 5 1 171\n", "5 6 1 85\n", "6 7 1 57\n", "7 8 1 86\n", "8 9 1 25\n", "9 10 1 29\n", "10 11 1 12\n", "11 12 1 0\n", "12 13 1 3\n", "13 1 2 412\n", "15 3 2 117\n", "16 4 2 126\n", "17 5 2 187\n", "18 6 2 104\n", "19 7 2 175\n", "20 8 2 68\n", "21 9 2 16\n", "22 10 2 4\n", "23 11 2 3\n", "24 12 2 0\n", "25 13 2 4\n", "26 1 3 184\n", "27 2 3 150\n", "29 4 3 174\n", "30 5 3 345\n", "31 6 3 11\n", "32 7 3 99\n", ".. ... ... ...\n", "204 10 16 0\n", "205 11 16 0\n", "206 12 16 0\n", "207 13 16 1\n", "208 1 17 0\n", "209 2 17 0\n", "210 3 17 0\n", "211 4 17 3\n", "212 5 17 4\n", "213 6 17 0\n", "214 7 17 0\n", "215 8 17 2\n", "216 9 17 0\n", "217 10 17 0\n", "218 11 17 0\n", "219 12 17 0\n", "220 13 17 0\n", "221 1 18 3\n", "222 2 18 0\n", "223 3 18 0\n", "224 4 18 2\n", "225 5 18 2\n", "226 6 18 0\n", "227 7 18 0\n", "228 8 18 0\n", "229 9 18 0\n", "230 10 18 0\n", "231 11 18 0\n", "232 12 18 0\n", "233 13 18 0\n", "\n", "[221 rows x 3 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# label the \"to\" end of every raw edge with that paper's cluster\n",
 "edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"to\", right_on=\"eid\")\n",
 "edgelist_tmp = edgelist_tmp.rename(columns={'cluster' : 'to_cluster'})\n",
 "# drop(columns=...) instead of a positional axis, which pandas 2.0 no longer accepts\n",
 "edgelist_tmp = edgelist_tmp.drop(columns='eid')\n",
 "\n",
 "# ... and the \"from\" end\n",
 "edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"from\", right_on=\"eid\")\n",
 "edgelist_tmp = edgelist_tmp.rename(columns={\"cluster\" : 'from_cluster'})\n",
 "edgelist_tmp = edgelist_tmp.drop(columns='eid')\n",
 "\n",
 "# keep only the edges that cross a cluster boundary\n",
 "edgelist_tmp = edgelist_tmp[[\"to_cluster\", \"from_cluster\"]]\n",
 "edgelist_tmp = edgelist_tmp[edgelist_tmp[\"to_cluster\"] != edgelist_tmp[\"from_cluster\"]]\n",
 "\n",
 "# count edges per (to_cluster, from_cluster) pair, then reshape to long format\n",
 "cluster_edgelist = pd.crosstab(edgelist_tmp[\"to_cluster\"], edgelist_tmp[\"from_cluster\"])\n",
 "cluster_edgelist[\"to_cluster\"] = cluster_edgelist.index\n",
 "\n",
 "cluster_edgelist = pd.melt(cluster_edgelist, id_vars=[\"to_cluster\"])\n",
 "# the crosstab re-creates the (all-zero) diagonal cells; remove them again\n",
 "cluster_edgelist = cluster_edgelist[cluster_edgelist['to_cluster'] != cluster_edgelist['from_cluster']]\n",
 "\n",
 "remember(\"cluster_edgelist\", cluster_edgelist)" ] },
{ "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [
 "top_clusters = g_sm_clu[\"cluster\"].value_counts().head(6).index\n",
 "\n",
 "# write the edgelist for the six largest clusters by paper count\n",
 "# (these are not necessarily the clusters numbered 1-6)\n",
 "cluster_edgelist_output = cluster_edgelist[(cluster_edgelist[\"to_cluster\"].isin(top_clusters)) &\n",
 "                                           (cluster_edgelist[\"from_cluster\"].isin(top_clusters))]\n",
 "\n",
 "cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output[\"value\"] > 0]\n",
 "\n",
 "g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[[\"from_cluster\", \"to_cluster\"]].values], directed=True)\n",
 "# edges were added in row order, so the weights line up with g_cluster.es\n",
 "g_cluster.es[\"weight\"] = cluster_edgelist_output[\"value\"].tolist()\n",
 "\n",
 "# assign the number of total articles as an attribute for each node\n",
 "g_cluster.vs[\"papers\"] = g_sm_clu[\"cluster\"].value_counts()[[x[\"name\"] for x in g_cluster.vs]].tolist()\n",
 "\n",
 "g_cluster.write_graphml(\"clusters.graphml\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# create network stats for tables (overall and within clusters)" ] },
{ "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [
 "def create_network_stats(g):\n",
 "    \"\"\"Build a per-article table of centrality statistics for graph ``g``.\n",
 "\n",
 "    Columns: ``eid`` (vertex name), ``eig_cent``, ``indegree`` and\n",
 "    ``betweenness``, plus ``title`` and ``source_title`` taken from the\n",
 "    notebook-level ``articles`` table. The inner join drops vertices that\n",
 "    have no row in ``articles``.\n",
 "    \"\"\"\n",
 "    network_stats = pd.DataFrame({'eid' : g.vs['name'],\n",
 "                                  'eig_cent' : g.eigenvector_centrality(),\n",
 "                                  'indegree' : g.indegree(),\n",
 "                                  'betweenness' : g.betweenness()})\n",
 "\n",
 "    # join on the article id explicitly rather than on whatever columns happen to overlap\n",
 "    return pd.merge(network_stats,\n",
 "                    articles[['eid', 'title', 'source_title']],\n",
 "                    on=\"eid\", how=\"inner\")" ] },
{ "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "network_stats = create_network_stats(g_full)" ] },
{ "cell_type": "code", "execution_count": 80, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
betweennesseideig_centindegreetitlesource_title
22756393.5604982-s2.0-711490889871.000000e+001876Users of the world, unite! The challenges and ...Business Horizons
1790.0000002-s2.0-434491350336.899762e-15645Why we twitter: Understanding microblogging us...Joint Ninth WebKDD and First SNA-KDD 2007 Work...
5120669.6253972-s2.0-799537117117.271520e-02468Social media? Get serious! Understanding the f...Business Horizons
18550.0000002-s2.0-673492681242.974873e-01450Social media: The new hybrid element of the pr...Business Horizons
\n", "
" ], "text/plain": [ " betweenness eid eig_cent indegree \\\n", "2275 6393.560498 2-s2.0-71149088987 1.000000e+00 1876 \n", "179 0.000000 2-s2.0-43449135033 6.899762e-15 645 \n", "5120 669.625397 2-s2.0-79953711711 7.271520e-02 468 \n", "1855 0.000000 2-s2.0-67349268124 2.974873e-01 450 \n", "\n", " title \\\n", "2275 Users of the world, unite! The challenges and ... \n", "179 Why we twitter: Understanding microblogging us... \n", "5120 Social media? Get serious! Understanding the f... \n", "1855 Social media: The new hybrid element of the pr... \n", "\n", " source_title \n", "2275 Business Horizons \n", "179 Joint Ninth WebKDD and First SNA-KDD 2007 Work... \n", "5120 Business Horizons \n", "1855 Business Horizons " ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "network_stats.sort_values(\"indegree\", ascending=False).head(4)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
betweennesseideig_centindegreetitlesource_title
22756393.5604982-s2.0-711490889871.0000001876Users of the world, unite! The challenges and ...Business Horizons
22590.0000002-s2.0-703498168880.60527970The fairyland of Second Life: Virtual social w...Business Horizons
36120.0000002-s2.0-779495225960.563979335Networked narratives: Understanding word-of-mo...Journal of Marketing
70880.0000002-s2.0-795515820370.43295136Online Personal Branding: Processes, Challenge...Journal of Interactive Marketing
\n", "
" ], "text/plain": [ " betweenness eid eig_cent indegree \\\n", "2275 6393.560498 2-s2.0-71149088987 1.000000 1876 \n", "2259 0.000000 2-s2.0-70349816888 0.605279 70 \n", "3612 0.000000 2-s2.0-77949522596 0.563979 335 \n", "7088 0.000000 2-s2.0-79551582037 0.432951 36 \n", "\n", " title \\\n", "2275 Users of the world, unite! The challenges and ... \n", "2259 The fairyland of Second Life: Virtual social w... \n", "3612 Networked narratives: Understanding word-of-mo... \n", "7088 Online Personal Branding: Processes, Challenge... \n", "\n", " source_title \n", "2275 Business Horizons \n", "2259 Business Horizons \n", "3612 Journal of Marketing \n", "7088 Journal of Interactive Marketing " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "network_stats.sort_values(\"eig_cent\", ascending=False).head(4)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
betweennesseideig_centindegreetitlesource_title
22756393.5604982-s2.0-711490889871.000000e+001876Users of the world, unite! The challenges and ...Business Horizons
4016220.2500002-s2.0-703504918893.749870e-16103Crisis in a networked world: Features of compu...Social Science Computer Review
27815131.8246392-s2.0-848880473001.310283e-0131Social media metrics - A framework and guideli...Journal of Interactive Marketing
38214319.7475612-s2.0-849101362353.045168e-188What are health-related users tweeting? A qual...Journal of Medical Internet Research
\n", "
" ], "text/plain": [ " betweenness eid eig_cent indegree \\\n", "2275 6393.560498 2-s2.0-71149088987 1.000000e+00 1876 \n", "401 6220.250000 2-s2.0-70350491889 3.749870e-16 103 \n", "2781 5131.824639 2-s2.0-84888047300 1.310283e-01 31 \n", "3821 4319.747561 2-s2.0-84910136235 3.045168e-18 8 \n", "\n", " title \\\n", "2275 Users of the world, unite! The challenges and ... \n", "401 Crisis in a networked world: Features of compu... \n", "2781 Social media metrics - A framework and guideli... \n", "3821 What are health-related users tweeting? A qual... \n", "\n", " source_title \n", "2275 Business Horizons \n", "401 Social Science Computer Review \n", "2781 Journal of Interactive Marketing \n", "3821 Journal of Medical Internet Research " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "network_stats.sort_values(\"betweenness\", ascending=False).head(4)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFKFJREFUeJzt3W9sW2fdxvHLifenJSVN7eKQLZNo1kqk2tYad0CgW/6YTlQIdRVEjBdoC6PJsjGyMbHhF9MkFinSiBwJGoEgRKNIaENKKEggJFO6ogRo4ixd1bAt6ZjUqFlM7NK667LO8XleVPPT0KS1XZ/45Ob7eVWf2T5X7sTXnJ9PznFZlmUJAGCskmIHAADYi6IHAMNR9ABgOIoeAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwFD0AGI6iBwDDuYu589HRUUWjUbW2tur06dN5PYfX69Xc3FyBkxWW0zM6PZ9ExkJwej7J+Rmdlq+qqiqr+xW16AOBgAKBQDEjAIDxGN0AgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC4oh5HXwiz99cVbd+lP/td0fYNANniHT0AGM6Wop+fn9fTTz+taDRqx9MDAHKQ1eimt7dXY2NjKi8vV3d3d2b7+Pi4+vv7lU6n1dTUpD179kiSDh48qM9+9rP2JAYA5CSrd/T19fUKhUKLtqXTafX19SkUCikcDmtoaEjT09N67bXXdOutt2r9+vW2BAYA5Card/S1tbWKxWKLtk1NTamyslI+n0+SVFdXp5GREc3Pz+v999/X9PS0brzxRm3fvl0lJXwUAADFkvdRN4lEQh6PJ3Pb4/FocnJS3/zmNyVJhw8f1rp165Yt+UgkokgkIknq6uqS1+vNK8dsXo8qjGwzu93uvL++leD0fBIZC8Hp+STnZ3R6vuXkXfSWZV2xzeVyZf5dX19/1ccHg0EFg8HMbSed4zlb2WZ22jms/5vT80lkLASn55Ocn9Fp+bI9H33eMxWPx6N4PJ65HY/HVVFRkdNzjI6O6qc//Wm+EQAAWci76GtqajQzM6NYLKZUKqXh4eGcLyISCATU2tqabwQAQBayGt309PRoYmJCyWRSbW1tam5uVmNjo1paWtTZ2al0Oq2GhgZVV1fntPPLLyUIALBHVkXf0dGx5Ha/3y+/35/3zrmUIADYj+MeAcBwRS16PowFAPsV9eyVjG4AwH6MbgDAcIxuAMBwjG4AwHCMbgDAcBQ9ABiOGT0AGI4ZPQAYjtENABiOogcAw1H0AGA4PowFAMPxYSwAGI7RDQAYjqIHAMNR9ABgOIoeAAzHUTcAYDiOugEAwzG6AQDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIbjOHoAMBzH0QOA4RjdAIDhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABgOIoeAAxH0QOA4Qr+l7HT09P6wx/+oGQyqTvuuEO7du0q9C4AADnIquh7e3s1Njam8vJydXd3Z7aPj4+rv79f6XRaTU1N2rNnj2699Vbt27dP6XSa89gAgANkNbqpr69XKBRatC2dTquvr0+hUEjhcFhDQ0Oanp6WdOlkZc8++6zuuOOOwicGAOQkq6Kvra1VWVnZom1TU1OqrKyUz+eT2+1WXV2dRkZGJF06Wdnzzz+vv/71r4VPDADISd4z+kQiIY/Hk7nt8Xg0OTmpEydO6B//+IdSqZS2b9++7OMjkYgikYgkqaurS16vN68cs3k9qjCyzex2u/P++laC0/NJZCwEp+eTnJ/R6fmWk3fRW5Z1xTaXy6WtW7dq69at13x8MBhUMBjM3J6bm8s3StFkm9nr9Tr663N6PomMheD0fJLzMzotX1VVVVb3y/vwSo/Ho3g8nrkdj8dVUVGR03Nw4REAsF/eRV9TU6OZmRnFYjGlUikNDw/nfBGRQCCg1tbWfCMAALKQ1eimp6dHExMTSiaTamtrU3NzsxobG9XS0qLOzk6l02k1NDSouro6p52Pjo4qGo1S9gBgo6yKvqOjY8ntfr9ffr8/751zKUEAsB+nQAAAwxW16PkwFgDsV/Bz3eSC0Q0
A2I/RDQAYjtENABiO0Q0AGI7RDQAYjqIHAMMxowcAwzGjBwDDMboBAMNR9ABgOIoeAAzHh7EAYDg+jAUAwzG6AQDDUfQAYDiKHgAMR9EDgOE46gYADMdRNwBgOEY3AGA4ih4ADEfRA4DhKHoAMBxFDwCGo+gBwHAcRw8AhuM4egAwHKMbADAcRQ8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwnC1/GXv06FGNjY3p3Llzuu+++3TXXXfZsRsAQBayLvre3l6NjY2pvLxc3d3dme3j4+Pq7+9XOp1WU1OT9uzZo7vvvlt33323zp8/rwMHDlD0AFBEWY9u6uvrFQqFFm1Lp9Pq6+tTKBRSOBzW0NCQpqenM/99YGBA9913X+HSAgBylnXR19bWqqysbNG2qakpVVZWyufzye12q66uTiMjI7IsS7/61a+0bds2bdq0qeChAQDZu64ZfSKRkMfjydz2eDyanJzUH//4Rx0/flwXLlzQO++8o127dl3x2EgkokgkIknq6uqS1+vNK8NsftELItvMbrc7769vJTg9n0TGQnB6Psn5GZ2ebznXVfSWZV2xzeVyaffu3dq9e/dVHxsMBhUMBjO35+bmridKUWSb2ev1Ovrrc3o+iYyF4PR8kvMzOi1fVVVVVve7rsMrPR6P4vF45nY8HldFRUXWj+fCIwBgv+sq+pqaGs3MzCgWiymVSml4eDinC4kEAgG1trZeTwQAwDVkPbrp6enRxMSEksmk2tra1NzcrMbGRrW0tKizs1PpdFoNDQ2qrq7Oeuejo6OKRqOUPQDYKOui7+joWHK73++X3+/Pa+dcShAA7McpEADAcEUtej6MBQD72XKum2wxugEA+zG6AQDDMboBAMMxugEAwzG6AQDDUfQAYDhm9ABgOGb0AGA4RjcAYDiKHgAMR9EDgOH4MBYADMeHsQBgOEY3AGA4ih4ADEfRA4DhKHoAMBxH3QCA4TjqBgAMx+gGAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwRT3qZrVb+NaXs7rfbIH3W/qz3xX4GQGYjOPoAcBwHEcPAIZjRg8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIYr+F/Gzs7OamBgQBcuXNB3v/vdQj89ACBHWb2j7+3t1cMPP3xFcY+Pj+s73/mOvv3tb+u3v/2tJMnn8+mRRx4pfFIAQF6yKvr6+nqFQqFF29LptPr6+hQKhRQOhzU0NKTp6WlbQgIA8pdV0dfW1qqsrGzRtqmpKVVWVsrn88ntdquurk4jIyO2hAQA5C/vGX0ikZDH48nc9ng8mpycVDKZ1K9//Wu9/fbbGhwc1P3337/k4yORiCKRiCSpq6tLXq83rxyFPtf7apDvWi3H7XYX/DkLjYzXz+n5JOdndHq+5eRd9JZlXbHN5XJp3bp12rdv3zUfHwwGFQwGM7fn5ubyjfI/p9Br5fV6Hb/+ZLx+Ts8nOT+j0/JVVVVldb+8D6/0eDyKx+OZ2/F4XBUVFTk9BxceAQD75V30NTU1mpmZUSwWUyqV0vDwcM4XEQkEAmptbc03AgAgC1mNbnp6ejQxMaFkMqm2tjY1NzersbFRLS0t6uzsVDqdVkNDg6qrq3Pa+ejoqKLRKGUPADbKqug7OjqW3O73++X3+/PeOZcSBAD7cQoEADBcUYueD2MBwH4FP9dNLhjdAID9GN0AgOEY3QCA4RjdAIDhGN0AgOEoegAwHDN6ADAcM3oAMByjGwAwHEUPAIaj6AHAcHwYCwCG48NYADAcoxsAMBxFDwCGo+gBwHAUPQAYrqgfxnJx8PwsfOvLBX2+2RzuW/qz3xV03wDsx1E3AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcJy9EgAMx3H0AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC4gv9l7Pz8vH7+85/L7XZr69a
t2rlzZ6F3AQDIQVZF39vbq7GxMZWXl6u7uzuzfXx8XP39/Uqn02pqatKePXt09OhRfeYzn1EgEFA4HKboAaDIshrd1NfXKxQKLdqWTqfV19enUCikcDisoaEhTU9PKx6Py+v1XnryEiZDAFBsWTVxbW2tysrKFm2bmppSZWWlfD6f3G636urqNDIyIo/Ho3g8LkmyLKvwiQEAOcl7Rp9IJOTxeDK3PR6PJicn9cUvflG/+MUvNDY2pk996lPLPj4SiSgSiUiSurq6Mr8F5Go2r0chXwvf+nJR9uv+/dG8f0ZWitvtdnRGp+eTipdx9v667O5nw759g8M2POtieRf9Uu/WXS6Xbr75ZrW3t1/z8cFgUMFgMHN7bm4u3yj4H5BKpRz/M+L1eh2d0en5pNWRsdCu5+utqqrK6n55D9EvH9FIUjweV0VFRU7PwYVHAMB+eRd9TU2NZmZmFIvFlEqlNDw8nPNFRAKBgFpbW/ONAADIQlajm56eHk1MTCiZTKqtrU3Nzc1qbGxUS0uLOjs7lU6n1dDQoOrqarvzAgBylFXRd3R0LLnd7/fL7/fnvfPR0VFFo1He1QOAjbhmLAAYrqh/0cSHsQBgP97RA4DhOEcBABjOZXGeAgAw2qp/R//MM88UO8I1OT2j0/NJZCwEp+eTnJ/R6fmWs+qLHgBwdRQ9ABiu9Lnnnnuu2CGu16ZNm4od4ZqcntHp+SQyFoLT80nOz+j0fEvhw1gAMByjGwAwXFH/YOp6LXXN2pU2Nzen/fv36z//+Y9cLpeCwaB2796tl19+WX/+85/10Y9+VJL0wAMPZM4LNDg4qEOHDqmkpEQPPfSQtm3bZnvORx99VDfffLNKSkpUWlqqrq4unT9/XuFwWP/+97+1ceNGPfHEEyorK5NlWerv79err76qm266Se3t7bb+unr69GmFw+HM7VgspubmZr377rtFXcOlrpWcz5odPnxYAwMDkqS9e/eqvr7e1owHDhxQNBqV2+2Wz+dTe3u7PvKRjygWi+mJJ57InMN88+bN2rdvnyTprbfe0v79+3Xx4kVt375dDz30kFwuly358nlt2PlaXypjOBzW6dOnJUkXLlzQ2rVr9cILLxRlDQvCWqUWFhasxx57zHrnnXesDz74wHrqqaesU6dOrXiORCJhnTx50rIsy7pw4YL1+OOPW6dOnbJeeukl6+DBg1fc/9SpU9ZTTz1lXbx40ZqdnbUee+wxa2Fhwfac7e3t1tmzZxdtO3DggDU4OGhZlmUNDg5aBw4csCzLsqLRqNXZ2Wml02nrjTfesL7//e/bnu9DCwsL1sMPP2zFYrGir+GJEyeskydPWk8++WRmW65rlkwmrUcffdRKJpOL/m1nxvHxcSuVSmXyfphxdnZ20f0u98wzz1hvvPGGlU6nrc7OTmtsbMy2fLl+X+1+rS+V8XIvvvii9Zvf/MayrOKsYSGs2tHNctesXWkVFRWZd25r1qzRLbfcokQisez9R0ZGVFdXpxtuuEEf+9jHVFlZqampqZWKe0WWe++9V5J07733ZtZvdHRU99xzj1wul7Zs2aJ3331XZ86cWZFMx48fV2VlpTZu3HjV3CuxhktdKznXNRsfH9edd96psrIylZWV6c4779T4+LitGe+66y6VlpZKkrZs2XLVn0dJOnPmjN577z1t2bJFLpdL99xzT8FeS0vlW85y31e7X+tXy2hZlv72t7/pc5/73FWfw841LIRVO7pZ7pq1xRSLxfSvf/1Lt99+u15//XX96U9/0pEjR7Rp0yZ94xvfUFlZmRKJhDZv3px5zIYNG675QiyUzs5OSdIXvvAFBYNBnT17NnNVsIqKCp07d07SpbW9/LqdHo9HiUQi5yuI5WNoaGjRi8ppa5jrmv33z+lKZpWkQ4cOqa7u/6+HGovF9L3vfU9r1qzR1772NX3yk59c8rVkd8Zcv6/Feq3/85//VHl5uT7+8Y9ntjllDXOxaoveWuaatcUyPz+v7u5
uPfjgg1q7dq127dqlr3zlK5Kkl156Sb/85S/V3t6+ZO6V8IMf/EAbNmzQ2bNn9fzzz1/1WpPFWttUKqVoNKqvf/3rkuS4NbyaXNZspX5OBwYGVFpaqp07d0q69D+m3t5erVu3Tm+99ZZeeOEFdXd3r/h65vp9LeZr/b/feDhlDXO1akc3hbhmbaGkUil1d3dr586d+vSnPy1JWr9+vUpKSlRSUqKmpiadPHlyydyJREIbNmywPeOH+ygvL9eOHTs0NTWl8vLyzEjmzJkzmQ/HPB7PogsWr9Tavvrqq/rEJz6h9evXS3LeGkrKec02bNhwRdaVWMvDhw8rGo3q8ccfz5TiDTfcoHXr1km6dCy4z+fTzMzMkq8lO9cz1+9rsV7rCwsLOnr06KLfiJyyhrlatUVfiGvWFoJlWfrJT36iW265RV/60pcy2y+faR89ejRzmcVAIKDh4WF98MEHisVimpmZ0e23325rxvn5eb333nuZf7/22mu67bbbFAgE9Morr0iSXnnlFe3YsSOT8ciRI7IsS2+++abWrl1blLGNk9bwQ7mu2bZt23Ts2DGdP39e58+f17Fjx2w/ymp8fFwHDx7U008/rZtuuimz/dy5c0qn05Kk2dlZzczMyOfzqaKiQmvWrNGbb74py7J05MgRW19LuX5fi/VaP378uKqqqhaNZJyyhrla1X8wNTY2phdffDFzzdq9e/eueIbXX39dzz77rG677bbMO6cHHnhAQ0NDevvtt+VyubRx40bt27cvU5YDAwP6y1/+opKSEj344IPavn27rRlnZ2f1wx/+UNKldymf//zntXfvXiWTSYXDYc3Nzcnr9erJJ5/MHCrY19enY8eO6cYbb1R7e7tqampszfj+++/rkUce0Y9//GOtXbtWkvSjH/2oqGt4+bWSy8vL1dzcrB07duS8ZocOHdLg4KCkS4dXNjQ02JpxcHBQqVQq8wHjh4cA/v3vf9fLL7+s0tJSlZSU6Ktf/WqmjE6ePKne3l5dvHhR27ZtU0tLS0HGI0vlO3HiRM7fVztf60tlbGxs1P79+7V582bt2rUrc99irGEhrOqiBwBc26od3QAAskPRA4DhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABguP8DaoV4MSni/p8AAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "network_stats['indegree'].hist(log = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# things to store" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "23131" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "remember('total_articles', articles.shape[0])" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "35620" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "4807" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "3864" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# total number of 
citations in the sm dataset\n", "remember('sm_citations', raw_edgelist.shape[0])\n", "\n", "remember('sm_citing', len(raw_edgelist[\"from\"].unique()))\n", "\n", "# the number of articles in the original dataset that have any INCOMING citations\n", "remember('sm_cited', len(raw_edgelist[\"to\"].unique()))" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "212773" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "42935" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "9710" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# total number of citations in the combo (all) dataset\n", "remember('all_citations', combo_raw_edgelist.shape[0])\n", "\n", "# the number of articles in the combo dataset that have any OUTGOING citations\n", "remember('all_citing', len(combo_raw_edgelist[\"from\"].unique()))\n", "\n", "# the number of articles in the combo dataset that have any INCOMING citations\n", "remember('all_cited', len(combo_raw_edgelist[\"to\"].unique()))" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eidcluster
02-s2.0-711490889871
12-s2.0-703498168881
22-s2.0-799537117111
32-s2.0-795516307511
42-s2.0-800514691031
52-s2.0-848667188511
62-s2.0-848776855511
72-s2.0-848644425471
82-s2.0-848614208641
92-s2.0-848874834871
102-s2.0-809551448471
112-s2.0-848850383091
122-s2.0-848860995691
132-s2.0-848633797831
142-s2.0-848990936631
152-s2.0-848791098591
162-s2.0-830551683091
172-s2.0-848763043221
182-s2.0-848661681471
192-s2.0-848778174281
202-s2.0-848734812561
212-s2.0-848617948971
222-s2.0-848995082981
232-s2.0-848980824651
242-s2.0-848790217741
252-s2.0-800549880411
262-s2.0-849443941181
272-s2.0-848705723011
282-s2.0-849071673201
292-s2.0-849146757211
.........
61102-s2.0-8485608683912
61112-s2.0-8485951012212
61122-s2.0-8490512120912
61132-s2.0-8488375861312
61142-s2.0-8487795310012
61152-s2.0-8490437676612
61162-s2.0-8490583718212
61172-s2.0-8490046121812
61182-s2.0-8375522878513
61192-s2.0-8488679597513
61202-s2.0-8487613278513
61212-s2.0-8490312133413
61222-s2.0-8486372040013
61232-s2.0-8487318093813
61242-s2.0-8491411283813
61252-s2.0-8487879574813
61262-s2.0-8488801166613
61272-s2.0-8494210121813
61282-s2.0-8005275211314
61292-s2.0-8487407470714
61302-s2.0-8494258223514
61312-s2.0-7084913036014
61322-s2.0-8486415263014
61332-s2.0-8486870916115
61342-s2.0-8489635001515
61352-s2.0-8494410493315
61362-s2.0-8487553950616
61372-s2.0-8490226295416
61382-s2.0-8490995448117
61392-s2.0-8492146967818
\n", "

6140 rows × 2 columns

\n", "
" ], "text/plain": [ " eid cluster\n", "0 2-s2.0-71149088987 1\n", "1 2-s2.0-70349816888 1\n", "2 2-s2.0-79953711711 1\n", "3 2-s2.0-79551630751 1\n", "4 2-s2.0-80051469103 1\n", "5 2-s2.0-84866718851 1\n", "6 2-s2.0-84877685551 1\n", "7 2-s2.0-84864442547 1\n", "8 2-s2.0-84861420864 1\n", "9 2-s2.0-84887483487 1\n", "10 2-s2.0-80955144847 1\n", "11 2-s2.0-84885038309 1\n", "12 2-s2.0-84886099569 1\n", "13 2-s2.0-84863379783 1\n", "14 2-s2.0-84899093663 1\n", "15 2-s2.0-84879109859 1\n", "16 2-s2.0-83055168309 1\n", "17 2-s2.0-84876304322 1\n", "18 2-s2.0-84866168147 1\n", "19 2-s2.0-84877817428 1\n", "20 2-s2.0-84873481256 1\n", "21 2-s2.0-84861794897 1\n", "22 2-s2.0-84899508298 1\n", "23 2-s2.0-84898082465 1\n", "24 2-s2.0-84879021774 1\n", "25 2-s2.0-80054988041 1\n", "26 2-s2.0-84944394118 1\n", "27 2-s2.0-84870572301 1\n", "28 2-s2.0-84907167320 1\n", "29 2-s2.0-84914675721 1\n", "... ... ...\n", "6110 2-s2.0-84856086839 12\n", "6111 2-s2.0-84859510122 12\n", "6112 2-s2.0-84905121209 12\n", "6113 2-s2.0-84883758613 12\n", "6114 2-s2.0-84877953100 12\n", "6115 2-s2.0-84904376766 12\n", "6116 2-s2.0-84905837182 12\n", "6117 2-s2.0-84900461218 12\n", "6118 2-s2.0-83755228785 13\n", "6119 2-s2.0-84886795975 13\n", "6120 2-s2.0-84876132785 13\n", "6121 2-s2.0-84903121334 13\n", "6122 2-s2.0-84863720400 13\n", "6123 2-s2.0-84873180938 13\n", "6124 2-s2.0-84914112838 13\n", "6125 2-s2.0-84878795748 13\n", "6126 2-s2.0-84888011666 13\n", "6127 2-s2.0-84942101218 13\n", "6128 2-s2.0-80052752113 14\n", "6129 2-s2.0-84874074707 14\n", "6130 2-s2.0-84942582235 14\n", "6131 2-s2.0-70849130360 14\n", "6132 2-s2.0-84864152630 14\n", "6133 2-s2.0-84868709161 15\n", "6134 2-s2.0-84896350015 15\n", "6135 2-s2.0-84944104933 15\n", "6136 2-s2.0-84875539506 16\n", "6137 2-s2.0-84902262954 16\n", "6138 2-s2.0-84909954481 17\n", "6139 2-s2.0-84921469678 18\n", "\n", "[6140 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ 
"remember('g_sm_clusters', g_sm_clu[[\"eid\", \"cluster\"]])" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['all_citations',\n", " 'all_cited',\n", " 'all_citing',\n", " 'cluster_edgelist',\n", " 'g_sm_clusters',\n", " 'sm_citations',\n", " 'sm_cited',\n", " 'sm_citing',\n", " 'total_articles']" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(r.keys())" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "# save everything we remembered to an RData file\n", "def save_to_r(r_dict, filename=\"output.RData\"):\n", "    \"\"\"Assign each entry of r_dict to an R variable and save them all to filename.\n", "\n", "    Underscores in the keys become dots in the R variable names. The variables\n", "    are then gathered into a single R object called `r`, and that object is\n", "    what gets written to the RData file.\n", "\n", "    r_dict   -- dict mapping variable names to the Python values to store\n", "    filename -- path of the RData file to write\n", "    \"\"\"\n", "    # iterate over the argument, not the notebook-level `r`, so the function\n", "    # saves whatever dict the caller actually passes in\n", "    for var_name, x in r_dict.items():\n", "        var_name = var_name.replace('_', '.')\n", "\n", "        # hand R a plain Python int rather than a numpy integer\n", "        # (.item() replaces np.asscalar, which newer numpy releases removed)\n", "        if isinstance(x, np.integer):\n", "            x = x.item()\n", "\n", "        # convert data frames explicitly; everything else is passed through as-is\n", "        if isinstance(x, pd.DataFrame):\n", "            rx = pandas2ri.py2ri(x)\n", "        else:\n", "            rx = x\n", "\n", "        # assign the converted value (the original assigned x and dropped rx)\n", "        robjects.r.assign(var_name, rx)\n", "\n", "    # create a new variable called r in R holding every variable in the R workspace\n", "    robjects.r(\"r <- sapply(ls(), function (x) {eval(parse(text=x))})\")\n", "    robjects.r('save(\"r\", file=\"{}\")'.format(filename))\n", "    # empty the R workspace again once the file has been written\n", "    robjects.r(\"rm(list=ls())\")\n", "\n", "save_to_r(r, \"../../paper/data/network_data.RData\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 1 }