]> code.communitydata.science - social-media-chapter.git/blob - code/bibliometrics/00_citation_network_analysis.ipynb
ignore rendered HTML
[social-media-chapter.git] / code / bibliometrics / 00_citation_network_analysis.ipynb
1 {
2  "cells": [
3   {
4    "cell_type": "markdown",
5    "metadata": {},
6    "source": [
7     "# Import data and get things set up"
8    ]
9   },
10   {
11    "cell_type": "code",
12    "execution_count": 52,
13    "metadata": {},
14    "outputs": [],
15    "source": [
16     "import random\n",
17     "random.seed(9001)"
18    ]
19   },
20   {
21    "cell_type": "code",
22    "execution_count": 53,
23    "metadata": {
24     "scrolled": true
25    },
26    "outputs": [
27     {
28      "name": "stdout",
29      "output_type": "stream",
30      "text": [
31       "Populating the interactive namespace from numpy and matplotlib\n"
32      ]
33     },
34     {
35      "name": "stderr",
36      "output_type": "stream",
37      "text": [
38       "/usr/lib/python3/dist-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['sin', 'pi', 'median', 'random', 'percentile', 'save', 'deprecated', 'Rectangle', 'load', 'mean', 'plot', 'cos']\n",
39       "`%matplotlib` prevents importing * from pylab and numpy\n",
40       "  \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
41      ]
42     }
43    ],
44    "source": [
45     "# turn on the magic so we have inline figures\n",
46     "%pylab inline\n",
47     "import matplotlib\n",
48     "matplotlib.style.use('ggplot')\n",
49     "from IPython.display import display"
50    ]
51   },
52   {
53    "cell_type": "code",
54    "execution_count": 54,
55    "metadata": {},
56    "outputs": [],
57    "source": [
58     "# import code to write r modules and create our variable we'll write to\n",
59     "import rpy2.robjects as robjects\n",
60     "from rpy2.robjects import pandas2ri\n",
61     "pandas2ri.activate()\n",
62     "\n",
63     "r = {}\n",
64     "def remember(name, x):\n",
65     "    r[name] = x\n",
66     "    display(x)"
67    ]
68   },
69   {
70    "cell_type": "code",
71    "execution_count": 55,
72    "metadata": {},
73    "outputs": [],
74    "source": [
75     "# load in modules we'll need for analysis\n",
76     "import subprocess\n",
77     "import csv\n",
78     "from igraph import *\n",
79     "import pandas as pd\n",
80     "import numpy as np\n",
81     "import re"
82    ]
83   },
84   {
85    "cell_type": "code",
86    "execution_count": 56,
87    "metadata": {
88     "scrolled": true
89    },
90    "outputs": [],
91    "source": [
92     "# grab the largest connected component with a little function\n",
93     "def get_largest_component(g):\n",
94     "    g_components = g.components(mode=\"WEAK\")\n",
95     "    max_size = max(g_components.sizes())\n",
96     "    for g_tmp in g_components.subgraphs():\n",
97     "        if g_tmp.vcount() == max_size:\n",
98     "            return(g_tmp)"
99    ]
100   },
101   {
102    "cell_type": "code",
103    "execution_count": 57,
104    "metadata": {},
105    "outputs": [],
106    "source": [
107     "# yield (from, to) pairs so the full edgelist can be loaded into igraph\n",
108     "def edge_list_iter(df):\n",
109     "    for i, row in df.iterrows():\n",
110     "        yield (row['from'], row['to'])"
111    ]
112   },
113   {
114    "cell_type": "code",
115    "execution_count": 58,
116    "metadata": {},
117    "outputs": [],
118    "source": [
119     "# list top 5 journals for each of the clusters\n",
120     "def top_journals_for_clusters(clu):\n",
121     "    articles_tmp = pd.merge(clu, articles[['eid', 'source_title']])\n",
122     "    \n",
123     "    output = pd.DataFrame()\n",
124     "    for cid in articles_tmp['cluster'].unique():\n",
125     "        journal_counts = articles_tmp['source_title'][articles_tmp['cluster'] == cid].value_counts().head(5)\n",
126     "        tmp = pd.DataFrame({'cluster' : cid, 'count' : journal_counts })        \n",
127     "        output = output.append(tmp)\n",
128     "\n",
129     "    output = output.reset_index()\n",
130     "    output = output.rename(columns = {'index' : \"journal\"})\n",
131     "    return(output)"
132    ]
133   },
134   {
135    "cell_type": "code",
136    "execution_count": 59,
137    "metadata": {},
138    "outputs": [],
139    "source": [
140     "def infomap_edgelist(g, edgelist_filename, directed=True):\n",
141     "    nodes_tmp = pd.DataFrame([ {'node_infomap' : v.index, \n",
142     "                                'eid' : v['name']} for v in g.vs ])\n",
143     "\n",
144     "    # write out the edgelist to an external file so we can call infomap on it\n",
145     "    with open(edgelist_filename + \".txt\", 'w') as f:\n",
146     "        for e in g.es:\n",
147     "            if e.source != e.target:\n",
148     "                if 'weight' in e.attributes():\n",
149     "                    print(\"{}\\t{}\\t{}\".format(e.source, e.target, e['weight']), file=f)\n",
150     "                else:\n",
151     "                    print(\"{}\\t{}\".format(e.source, e.target), file=f)\n",
152     "\n",
153     "                    \n",
154     "    # run the external program to generate the infomap clustering\n",
155     "    infomap_cmdline = [\"infomap/Infomap\", edgelist_filename + \".txt\", \"output_dir -z --map --clu --tree\"]\n",
156     "    if directed:\n",
157     "        infomap_cmdline.append(\"-d\")\n",
158     "    subprocess.call(infomap_cmdline)\n",
159     "\n",
160     "    # load up the clu data\n",
161     "    clu = pd.read_csv(\"output_dir/\" + edgelist_filename + \".clu\",\n",
162     "                      header=None, comment=\"#\", delim_whitespace=True)\n",
163     "    clu.columns = ['node_infomap', 'cluster', 'flow']\n",
164     "    \n",
165     "    return pd.merge(clu, nodes_tmp, on=\"node_infomap\")"
166    ]
167   },
168   {
169    "cell_type": "code",
170    "execution_count": 60,
171    "metadata": {},
172    "outputs": [],
173    "source": [
174     "def write_graphml(g, clu, graphml_filename):\n",
175     "    clu = clu[['node_infomap', 'cluster']].sort_values('node_infomap')\n",
176     "    g.vs[\"cluster\"] =  clu[\"cluster\"].tolist()\n",
177     "    g.write_graphml(graphml_filename)"
178    ]
179   },
180   {
181    "cell_type": "code",
182    "execution_count": 61,
183    "metadata": {},
184    "outputs": [],
185    "source": [
186     "# load article data\n",
187     "articles = pd.read_csv(\"../../processed_data/abstracts.tsv\", delimiter=\"\\t\")"
188    ]
189   },
190   {
191    "cell_type": "markdown",
192    "metadata": {},
193    "source": [
194     "# network for just the central \"social media\" set"
195    ]
196   },
197   {
198    "cell_type": "code",
199    "execution_count": 62,
200    "metadata": {},
201    "outputs": [],
202    "source": [
203     "# this contains the list of all INCOMING citations to each paper in the original set\n",
204     "raw_edgelist = pd.read_csv(\"../../processed_data/social_media_edgelist.txt\", delimiter=\"\\t\")"
205    ]
206   },
207   {
208    "cell_type": "code",
209    "execution_count": 63,
210    "metadata": {},
211    "outputs": [],
212    "source": [
213     "g_sm_all = Graph.TupleList([i for i in edge_list_iter(raw_edgelist)], directed=True)"
214    ]
215   },
216   {
217    "cell_type": "code",
218    "execution_count": 64,
219    "metadata": {},
220    "outputs": [],
221    "source": [
222     "g_sm = get_largest_component(g_sm_all)\n",
223     "g_sm = g_sm.simplify()"
224    ]
225   },
226   {
227    "cell_type": "code",
228    "execution_count": 65,
229    "metadata": {},
230    "outputs": [],
231    "source": [
232     "g_sm_clu = infomap_edgelist(g_sm, \"sm_edgelist_infomap\", directed=True)"
233    ]
234   },
235   {
236    "cell_type": "code",
237    "execution_count": 66,
238    "metadata": {},
239    "outputs": [
240     {
241      "data": {
242       "text/plain": [
243        "2     1817\n",
244        "1     1748\n",
245        "3     1088\n",
246        "4      653\n",
247        "6      355\n",
248        "10     114\n",
249        "5      104\n",
250        "9       90\n",
251        "8       59\n",
252        "7       44\n",
253        "12      27\n",
254        "11      19\n",
255        "13      10\n",
256        "14       5\n",
257        "15       3\n",
258        "16       2\n",
259        "18       1\n",
260        "17       1\n",
261        "Name: cluster, dtype: int64"
262       ]
263      },
264      "execution_count": 66,
265      "metadata": {},
266      "output_type": "execute_result"
267     }
268    ],
269    "source": [
270     "g_sm_clu['cluster'].value_counts()"
271    ]
272   },
273   {
274    "cell_type": "code",
275    "execution_count": 67,
276    "metadata": {},
277    "outputs": [
278     {
279      "data": {
280       "text/html": [
281        "<div>\n",
282        "<style>\n",
283        "    .dataframe thead tr:only-child th {\n",
284        "        text-align: right;\n",
285        "    }\n",
286        "\n",
287        "    .dataframe thead th {\n",
288        "        text-align: left;\n",
289        "    }\n",
290        "\n",
291        "    .dataframe tbody tr th {\n",
292        "        vertical-align: top;\n",
293        "    }\n",
294        "</style>\n",
295        "<table border=\"1\" class=\"dataframe\">\n",
296        "  <thead>\n",
297        "    <tr style=\"text-align: right;\">\n",
298        "      <th></th>\n",
299        "      <th>journal</th>\n",
300        "      <th>cluster</th>\n",
301        "      <th>count</th>\n",
302        "    </tr>\n",
303        "  </thead>\n",
304        "  <tbody>\n",
305        "    <tr>\n",
306        "      <th>40</th>\n",
307        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
308        "      <td>9</td>\n",
309        "      <td>4</td>\n",
310        "    </tr>\n",
311        "    <tr>\n",
312        "      <th>41</th>\n",
313        "      <td>WSDM 2013 - Proceedings of the 6th ACM Interna...</td>\n",
314        "      <td>9</td>\n",
315        "      <td>4</td>\n",
316        "    </tr>\n",
317        "    <tr>\n",
318        "      <th>42</th>\n",
319        "      <td>Conference on Human Factors in Computing Syste...</td>\n",
320        "      <td>9</td>\n",
321        "      <td>2</td>\n",
322        "    </tr>\n",
323        "    <tr>\n",
324        "      <th>43</th>\n",
325        "      <td>WWW 2013 Companion - Proceedings of the 22nd I...</td>\n",
326        "      <td>9</td>\n",
327        "      <td>2</td>\n",
328        "    </tr>\n",
329        "    <tr>\n",
330        "      <th>44</th>\n",
331        "      <td>PLoS ONE</td>\n",
332        "      <td>9</td>\n",
333        "      <td>2</td>\n",
334        "    </tr>\n",
335        "  </tbody>\n",
336        "</table>\n",
337        "</div>"
338       ],
339       "text/plain": [
340        "                                              journal  cluster  count\n",
341        "40  Lecture Notes in Computer Science (including s...        9      4\n",
342        "41  WSDM 2013 - Proceedings of the 6th ACM Interna...        9      4\n",
343        "42  Conference on Human Factors in Computing Syste...        9      2\n",
344        "43  WWW 2013 Companion - Proceedings of the 22nd I...        9      2\n",
345        "44                                           PLoS ONE        9      2"
346       ]
347      },
348      "execution_count": 67,
349      "metadata": {},
350      "output_type": "execute_result"
351     }
352    ],
353    "source": [
354     "tmp = top_journals_for_clusters(g_sm_clu)\n",
355     "tmp[tmp.cluster == 9]"
356    ]
357   },
358   {
359    "cell_type": "code",
360    "execution_count": 68,
361    "metadata": {},
362    "outputs": [],
363    "source": [
364     "write_graphml(g_sm, g_sm_clu, \"g_sm.graphml\")"
365    ]
366   },
367   {
368    "cell_type": "markdown",
369    "metadata": {},
370    "source": [
371     "# larger network that contains the incoming cites to citing articles"
372    ]
373   },
374   {
375    "cell_type": "code",
376    "execution_count": 69,
377    "metadata": {},
378    "outputs": [],
379    "source": [
380     "# this contains the list of all INCOMING citations to everything in the original set\n",
381     "# plus every INCOMING citation to every paper that cites one of those papers\n",
382     "raw_edgelist_files = [\"../../processed_data/citation_edgelist.txt\",\n",
383     "                      \"../../processed_data/social_media_edgelist.txt\"]\n",
384     "combo_raw_edgelist = pd.concat([pd.read_csv(x, delimiter=\"\\t\") for x in raw_edgelist_files])"
385    ]
386   },
387   {
388    "cell_type": "code",
389    "execution_count": 70,
390    "metadata": {},
391    "outputs": [],
392    "source": [
393     "g_full_all = Graph.TupleList([i for i in edge_list_iter(combo_raw_edgelist)], directed=True)"
394    ]
395   },
396   {
397    "cell_type": "code",
398    "execution_count": 71,
399    "metadata": {},
400    "outputs": [],
401    "source": [
402     "g_full = get_largest_component(g_full_all)\n",
403     "g_full = g_full.simplify()"
404    ]
405   },
406   {
407    "cell_type": "code",
408    "execution_count": 72,
409    "metadata": {},
410    "outputs": [],
411    "source": [
412     "g_full_clu = infomap_edgelist(g_full, \"citation_edglist_infomap\", directed=True)"
413    ]
414   },
415   {
416    "cell_type": "code",
417    "execution_count": 73,
418    "metadata": {},
419    "outputs": [
420     {
421      "data": {
422       "text/plain": [
423        "1     9243\n",
424        "2     8225\n",
425        "3     6826\n",
426        "4     3227\n",
427        "6     2835\n",
428        "5     2704\n",
429        "7     1911\n",
430        "9      810\n",
431        "8      803\n",
432        "10     589\n",
433        "11     520\n",
434        "12     491\n",
435        "13     336\n",
436        "14     219\n",
437        "15     175\n",
438        "17     162\n",
439        "16     153\n",
440        "22     139\n",
441        "18     135\n",
442        "19     118\n",
443        "25     117\n",
444        "23     106\n",
445        "21      93\n",
446        "24      88\n",
447        "30      84\n",
448        "28      79\n",
449        "27      78\n",
450        "32      76\n",
451        "26      73\n",
452        "20      71\n",
453        "      ... \n",
454        "54      26\n",
455        "56      25\n",
456        "52      23\n",
457        "49      23\n",
458        "55      22\n",
459        "58      19\n",
460        "62      18\n",
461        "61      18\n",
462        "63      18\n",
463        "60      17\n",
464        "66      15\n",
465        "59      15\n",
466        "57      15\n",
467        "65      14\n",
468        "68      13\n",
469        "53       7\n",
470        "64       6\n",
471        "73       6\n",
472        "71       4\n",
473        "70       4\n",
474        "74       3\n",
475        "67       3\n",
476        "72       3\n",
477        "69       3\n",
478        "75       2\n",
479        "78       1\n",
480        "79       1\n",
481        "77       1\n",
482        "80       1\n",
483        "76       1\n",
484        "Name: cluster, Length: 80, dtype: int64"
485       ]
486      },
487      "execution_count": 73,
488      "metadata": {},
489      "output_type": "execute_result"
490     }
491    ],
492    "source": [
493     "g_full_clu['cluster'].value_counts()"
494    ]
495   },
496   {
497    "cell_type": "code",
498    "execution_count": 74,
499    "metadata": {},
500    "outputs": [
501     {
502      "data": {
503       "text/html": [
504        "<div>\n",
505        "<style>\n",
506        "    .dataframe thead tr:only-child th {\n",
507        "        text-align: right;\n",
508        "    }\n",
509        "\n",
510        "    .dataframe thead th {\n",
511        "        text-align: left;\n",
512        "    }\n",
513        "\n",
514        "    .dataframe tbody tr th {\n",
515        "        vertical-align: top;\n",
516        "    }\n",
517        "</style>\n",
518        "<table border=\"1\" class=\"dataframe\">\n",
519        "  <thead>\n",
520        "    <tr style=\"text-align: right;\">\n",
521        "      <th></th>\n",
522        "      <th>journal</th>\n",
523        "      <th>cluster</th>\n",
524        "      <th>count</th>\n",
525        "    </tr>\n",
526        "  </thead>\n",
527        "  <tbody>\n",
528        "    <tr>\n",
529        "      <th>0</th>\n",
530        "      <td>Public Relations Review</td>\n",
531        "      <td>1</td>\n",
532        "      <td>119</td>\n",
533        "    </tr>\n",
534        "    <tr>\n",
535        "      <th>1</th>\n",
536        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
537        "      <td>1</td>\n",
538        "      <td>81</td>\n",
539        "    </tr>\n",
540        "    <tr>\n",
541        "      <th>2</th>\n",
542        "      <td>Computers in Human Behavior</td>\n",
543        "      <td>1</td>\n",
544        "      <td>71</td>\n",
545        "    </tr>\n",
546        "    <tr>\n",
547        "      <th>3</th>\n",
548        "      <td>Proceedings of the Annual Hawaii International...</td>\n",
549        "      <td>1</td>\n",
550        "      <td>49</td>\n",
551        "    </tr>\n",
552        "    <tr>\n",
553        "      <th>4</th>\n",
554        "      <td>Government Information Quarterly</td>\n",
555        "      <td>1</td>\n",
556        "      <td>40</td>\n",
557        "    </tr>\n",
558        "    <tr>\n",
559        "      <th>5</th>\n",
560        "      <td>Journal of Medical Internet Research</td>\n",
561        "      <td>2</td>\n",
562        "      <td>149</td>\n",
563        "    </tr>\n",
564        "    <tr>\n",
565        "      <th>6</th>\n",
566        "      <td>PLoS ONE</td>\n",
567        "      <td>2</td>\n",
568        "      <td>43</td>\n",
569        "    </tr>\n",
570        "    <tr>\n",
571        "      <th>7</th>\n",
572        "      <td>Studies in Health Technology and Informatics</td>\n",
573        "      <td>2</td>\n",
574        "      <td>41</td>\n",
575        "    </tr>\n",
576        "    <tr>\n",
577        "      <th>8</th>\n",
578        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
579        "      <td>2</td>\n",
580        "      <td>32</td>\n",
581        "    </tr>\n",
582        "    <tr>\n",
583        "      <th>9</th>\n",
584        "      <td>Annals of Emergency Medicine</td>\n",
585        "      <td>2</td>\n",
586        "      <td>17</td>\n",
587        "    </tr>\n",
588        "    <tr>\n",
589        "      <th>10</th>\n",
590        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
591        "      <td>3</td>\n",
592        "      <td>180</td>\n",
593        "    </tr>\n",
594        "    <tr>\n",
595        "      <th>11</th>\n",
596        "      <td>ACM International Conference Proceeding Series</td>\n",
597        "      <td>3</td>\n",
598        "      <td>51</td>\n",
599        "    </tr>\n",
600        "    <tr>\n",
601        "      <th>12</th>\n",
602        "      <td>International Conference on Information and Kn...</td>\n",
603        "      <td>3</td>\n",
604        "      <td>38</td>\n",
605        "    </tr>\n",
606        "    <tr>\n",
607        "      <th>13</th>\n",
608        "      <td>CEUR Workshop Proceedings</td>\n",
609        "      <td>3</td>\n",
610        "      <td>37</td>\n",
611        "    </tr>\n",
612        "    <tr>\n",
613        "      <th>14</th>\n",
614        "      <td>PLoS ONE</td>\n",
615        "      <td>3</td>\n",
616        "      <td>36</td>\n",
617        "    </tr>\n",
618        "    <tr>\n",
619        "      <th>15</th>\n",
620        "      <td>Information Communication and Society</td>\n",
621        "      <td>4</td>\n",
622        "      <td>70</td>\n",
623        "    </tr>\n",
624        "    <tr>\n",
625        "      <th>16</th>\n",
626        "      <td>New Media and Society</td>\n",
627        "      <td>4</td>\n",
628        "      <td>34</td>\n",
629        "    </tr>\n",
630        "    <tr>\n",
631        "      <th>17</th>\n",
632        "      <td>First Monday</td>\n",
633        "      <td>4</td>\n",
634        "      <td>24</td>\n",
635        "    </tr>\n",
636        "    <tr>\n",
637        "      <th>18</th>\n",
638        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
639        "      <td>4</td>\n",
640        "      <td>23</td>\n",
641        "    </tr>\n",
642        "    <tr>\n",
643        "      <th>19</th>\n",
644        "      <td>Computers in Human Behavior</td>\n",
645        "      <td>4</td>\n",
646        "      <td>21</td>\n",
647        "    </tr>\n",
648        "    <tr>\n",
649        "      <th>20</th>\n",
650        "      <td>Computers in Human Behavior</td>\n",
651        "      <td>5</td>\n",
652        "      <td>42</td>\n",
653        "    </tr>\n",
654        "    <tr>\n",
655        "      <th>21</th>\n",
656        "      <td>Cyberpsychology, Behavior, and Social Networking</td>\n",
657        "      <td>5</td>\n",
658        "      <td>42</td>\n",
659        "    </tr>\n",
660        "    <tr>\n",
661        "      <th>22</th>\n",
662        "      <td>Personality and Individual Differences</td>\n",
663        "      <td>5</td>\n",
664        "      <td>11</td>\n",
665        "    </tr>\n",
666        "    <tr>\n",
667        "      <th>23</th>\n",
668        "      <td>Journal of Medical Internet Research</td>\n",
669        "      <td>5</td>\n",
670        "      <td>11</td>\n",
671        "    </tr>\n",
672        "    <tr>\n",
673        "      <th>24</th>\n",
674        "      <td>Journal of Adolescent Health</td>\n",
675        "      <td>5</td>\n",
676        "      <td>11</td>\n",
677        "    </tr>\n",
678        "    <tr>\n",
679        "      <th>25</th>\n",
680        "      <td>Computers in Human Behavior</td>\n",
681        "      <td>6</td>\n",
682        "      <td>38</td>\n",
683        "    </tr>\n",
684        "    <tr>\n",
685        "      <th>26</th>\n",
686        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
687        "      <td>6</td>\n",
688        "      <td>24</td>\n",
689        "    </tr>\n",
690        "    <tr>\n",
691        "      <th>27</th>\n",
692        "      <td>Computers and Education</td>\n",
693        "      <td>6</td>\n",
694        "      <td>16</td>\n",
695        "    </tr>\n",
696        "    <tr>\n",
697        "      <th>28</th>\n",
698        "      <td>Conference on Human Factors in Computing Syste...</td>\n",
699        "      <td>6</td>\n",
700        "      <td>11</td>\n",
701        "    </tr>\n",
702        "    <tr>\n",
703        "      <th>29</th>\n",
704        "      <td>Journal of Marketing Education</td>\n",
705        "      <td>6</td>\n",
706        "      <td>11</td>\n",
707        "    </tr>\n",
708        "    <tr>\n",
709        "      <th>...</th>\n",
710        "      <td>...</td>\n",
711        "      <td>...</td>\n",
712        "      <td>...</td>\n",
713        "    </tr>\n",
714        "    <tr>\n",
715        "      <th>286</th>\n",
716        "      <td>Medical Journal of Australia</td>\n",
717        "      <td>63</td>\n",
718        "      <td>1</td>\n",
719        "    </tr>\n",
720        "    <tr>\n",
721        "      <th>287</th>\n",
722        "      <td>Nicotine and Tobacco Research</td>\n",
723        "      <td>63</td>\n",
724        "      <td>1</td>\n",
725        "    </tr>\n",
726        "    <tr>\n",
727        "      <th>288</th>\n",
728        "      <td>35th International Conference on Information S...</td>\n",
729        "      <td>64</td>\n",
730        "      <td>1</td>\n",
731        "    </tr>\n",
732        "    <tr>\n",
733        "      <th>289</th>\n",
734        "      <td>First Monday</td>\n",
735        "      <td>64</td>\n",
736        "      <td>1</td>\n",
737        "    </tr>\n",
738        "    <tr>\n",
739        "      <th>290</th>\n",
740        "      <td>Cyberpsychology, Behavior, and Social Networking</td>\n",
741        "      <td>64</td>\n",
742        "      <td>1</td>\n",
743        "    </tr>\n",
744        "    <tr>\n",
745        "      <th>291</th>\n",
746        "      <td>HT'12 - Proceedings of 23rd ACM Conference on ...</td>\n",
747        "      <td>65</td>\n",
748        "      <td>1</td>\n",
749        "    </tr>\n",
750        "    <tr>\n",
751        "      <th>292</th>\n",
752        "      <td>IEEE/ACM Transactions on Networking</td>\n",
753        "      <td>65</td>\n",
754        "      <td>1</td>\n",
755        "    </tr>\n",
756        "    <tr>\n",
757        "      <th>293</th>\n",
758        "      <td>Journal of Healthcare Engineering</td>\n",
759        "      <td>65</td>\n",
760        "      <td>1</td>\n",
761        "    </tr>\n",
762        "    <tr>\n",
763        "      <th>294</th>\n",
764        "      <td>International Journal of Information Management</td>\n",
765        "      <td>66</td>\n",
766        "      <td>2</td>\n",
767        "    </tr>\n",
768        "    <tr>\n",
769        "      <th>295</th>\n",
770        "      <td>Journal of Theoretical and Applied Electronic ...</td>\n",
771        "      <td>66</td>\n",
772        "      <td>1</td>\n",
773        "    </tr>\n",
774        "    <tr>\n",
775        "      <th>296</th>\n",
776        "      <td>Journal of Experimental and Theoretical Artifi...</td>\n",
777        "      <td>66</td>\n",
778        "      <td>1</td>\n",
779        "    </tr>\n",
780        "    <tr>\n",
781        "      <th>297</th>\n",
782        "      <td>McKinsey Quarterly</td>\n",
783        "      <td>66</td>\n",
784        "      <td>1</td>\n",
785        "    </tr>\n",
786        "    <tr>\n",
787        "      <th>298</th>\n",
788        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
789        "      <td>66</td>\n",
790        "      <td>1</td>\n",
791        "    </tr>\n",
792        "    <tr>\n",
793        "      <th>299</th>\n",
794        "      <td>Science (New York, N.Y.)</td>\n",
795        "      <td>67</td>\n",
796        "      <td>1</td>\n",
797        "    </tr>\n",
798        "    <tr>\n",
799        "      <th>300</th>\n",
800        "      <td>International Conference on Information and Kn...</td>\n",
801        "      <td>68</td>\n",
802        "      <td>1</td>\n",
803        "    </tr>\n",
804        "    <tr>\n",
805        "      <th>301</th>\n",
806        "      <td>Lecture Notes in Computer Science (including s...</td>\n",
807        "      <td>68</td>\n",
808        "      <td>1</td>\n",
809        "    </tr>\n",
810        "    <tr>\n",
811        "      <th>302</th>\n",
812        "      <td>16th Americas Conference on Information System...</td>\n",
813        "      <td>68</td>\n",
814        "      <td>1</td>\n",
815        "    </tr>\n",
816        "    <tr>\n",
817        "      <th>303</th>\n",
818        "      <td>Procedia Engineering</td>\n",
819        "      <td>68</td>\n",
820        "      <td>1</td>\n",
821        "    </tr>\n",
822        "    <tr>\n",
823        "      <th>304</th>\n",
824        "      <td>International Journal of Virtual and Personal ...</td>\n",
825        "      <td>68</td>\n",
826        "      <td>1</td>\n",
827        "    </tr>\n",
828        "    <tr>\n",
829        "      <th>305</th>\n",
830        "      <td>Scientometrics</td>\n",
831        "      <td>69</td>\n",
832        "      <td>1</td>\n",
833        "    </tr>\n",
834        "    <tr>\n",
835        "      <th>306</th>\n",
836        "      <td>Conference on Human Factors in Computing Syste...</td>\n",
837        "      <td>70</td>\n",
838        "      <td>2</td>\n",
839        "    </tr>\n",
840        "    <tr>\n",
841        "      <th>307</th>\n",
842        "      <td>NyS</td>\n",
843        "      <td>71</td>\n",
844        "      <td>2</td>\n",
845        "    </tr>\n",
846        "    <tr>\n",
847        "      <th>308</th>\n",
848        "      <td>Aslib Proceedings: New Information Perspectives</td>\n",
849        "      <td>71</td>\n",
850        "      <td>1</td>\n",
851        "    </tr>\n",
852        "    <tr>\n",
853        "      <th>309</th>\n",
854        "      <td>WWW 2013 Companion - Proceedings of the 22nd I...</td>\n",
855        "      <td>72</td>\n",
856        "      <td>1</td>\n",
857        "    </tr>\n",
858        "    <tr>\n",
859        "      <th>310</th>\n",
860        "      <td>Cyberpsychology, Behavior, and Social Networking</td>\n",
861        "      <td>72</td>\n",
862        "      <td>1</td>\n",
863        "    </tr>\n",
864        "    <tr>\n",
865        "      <th>311</th>\n",
866        "      <td>PACIS 2011 - 15th Pacific Asia Conference on I...</td>\n",
867        "      <td>73</td>\n",
868        "      <td>1</td>\n",
869        "    </tr>\n",
870        "    <tr>\n",
871        "      <th>312</th>\n",
872        "      <td>Proceedings of the International Conference on...</td>\n",
873        "      <td>73</td>\n",
874        "      <td>1</td>\n",
875        "    </tr>\n",
876        "    <tr>\n",
877        "      <th>313</th>\n",
878        "      <td>Online (Wilton, Connecticut)</td>\n",
879        "      <td>74</td>\n",
880        "      <td>1</td>\n",
881        "    </tr>\n",
882        "    <tr>\n",
883        "      <th>314</th>\n",
884        "      <td>Catalan Journal of Communication and Cultural ...</td>\n",
885        "      <td>75</td>\n",
886        "      <td>1</td>\n",
887        "    </tr>\n",
888        "    <tr>\n",
889        "      <th>315</th>\n",
890        "      <td>Proceedings - Pacific Asia Conference on Infor...</td>\n",
891        "      <td>75</td>\n",
892        "      <td>1</td>\n",
893        "    </tr>\n",
894        "  </tbody>\n",
895        "</table>\n",
896        "<p>316 rows × 3 columns</p>\n",
897        "</div>"
898       ],
899       "text/plain": [
900        "                                               journal  cluster  count\n",
901        "0                              Public Relations Review        1    119\n",
902        "1    Lecture Notes in Computer Science (including s...        1     81\n",
903        "2                          Computers in Human Behavior        1     71\n",
904        "3    Proceedings of the Annual Hawaii International...        1     49\n",
905        "4                     Government Information Quarterly        1     40\n",
906        "5                 Journal of Medical Internet Research        2    149\n",
907        "6                                             PLoS ONE        2     43\n",
908        "7         Studies in Health Technology and Informatics        2     41\n",
909        "8    Lecture Notes in Computer Science (including s...        2     32\n",
910        "9                         Annals of Emergency Medicine        2     17\n",
911        "10   Lecture Notes in Computer Science (including s...        3    180\n",
912        "11      ACM International Conference Proceeding Series        3     51\n",
913        "12   International Conference on Information and Kn...        3     38\n",
914        "13                           CEUR Workshop Proceedings        3     37\n",
915        "14                                            PLoS ONE        3     36\n",
916        "15               Information Communication and Society        4     70\n",
917        "16                               New Media and Society        4     34\n",
918        "17                                        First Monday        4     24\n",
919        "18   Lecture Notes in Computer Science (including s...        4     23\n",
920        "19                         Computers in Human Behavior        4     21\n",
921        "20                         Computers in Human Behavior        5     42\n",
922        "21    Cyberpsychology, Behavior, and Social Networking        5     42\n",
923        "22              Personality and Individual Differences        5     11\n",
924        "23                Journal of Medical Internet Research        5     11\n",
925        "24                        Journal of Adolescent Health        5     11\n",
926        "25                         Computers in Human Behavior        6     38\n",
927        "26   Lecture Notes in Computer Science (including s...        6     24\n",
928        "27                             Computers and Education        6     16\n",
929        "28   Conference on Human Factors in Computing Syste...        6     11\n",
930        "29                      Journal of Marketing Education        6     11\n",
931        "..                                                 ...      ...    ...\n",
932        "286                       Medical Journal of Australia       63      1\n",
933        "287                      Nicotine and Tobacco Research       63      1\n",
934        "288  35th International Conference on Information S...       64      1\n",
935        "289                                       First Monday       64      1\n",
936        "290   Cyberpsychology, Behavior, and Social Networking       64      1\n",
937        "291  HT'12 - Proceedings of 23rd ACM Conference on ...       65      1\n",
938        "292                IEEE/ACM Transactions on Networking       65      1\n",
939        "293                  Journal of Healthcare Engineering       65      1\n",
940        "294    International Journal of Information Management       66      2\n",
941        "295  Journal of Theoretical and Applied Electronic ...       66      1\n",
942        "296  Journal of Experimental and Theoretical Artifi...       66      1\n",
943        "297                                 McKinsey Quarterly       66      1\n",
944        "298  Lecture Notes in Computer Science (including s...       66      1\n",
945        "299                           Science (New York, N.Y.)       67      1\n",
946        "300  International Conference on Information and Kn...       68      1\n",
947        "301  Lecture Notes in Computer Science (including s...       68      1\n",
948        "302  16th Americas Conference on Information System...       68      1\n",
949        "303                               Procedia Engineering       68      1\n",
950        "304  International Journal of Virtual and Personal ...       68      1\n",
951        "305                                     Scientometrics       69      1\n",
952        "306  Conference on Human Factors in Computing Syste...       70      2\n",
953        "307                                                NyS       71      2\n",
954        "308    Aslib Proceedings: New Information Perspectives       71      1\n",
955        "309  WWW 2013 Companion - Proceedings of the 22nd I...       72      1\n",
956        "310   Cyberpsychology, Behavior, and Social Networking       72      1\n",
957        "311  PACIS 2011 - 15th Pacific Asia Conference on I...       73      1\n",
958        "312  Proceedings of the International Conference on...       73      1\n",
959        "313                       Online (Wilton, Connecticut)       74      1\n",
960        "314  Catalan Journal of Communication and Cultural ...       75      1\n",
961        "315  Proceedings - Pacific Asia Conference on Infor...       75      1\n",
962        "\n",
963        "[316 rows x 3 columns]"
964       ]
965      },
966      "execution_count": 74,
967      "metadata": {},
968      "output_type": "execute_result"
969     }
970    ],
971    "source": [
972     "top_journals_for_clusters(g_full_clu)"
973    ]
974   },
975   {
976    "cell_type": "code",
977    "execution_count": 75,
978    "metadata": {},
979    "outputs": [],
980    "source": [
981     "write_graphml(g_full, g_full_clu, \"g_full.graphml\")"
982    ]
983   },
984   {
985    "cell_type": "markdown",
986    "metadata": {},
987    "source": [
988     "# Create the meta-network of connections between clusters"
989    ]
990   },
991   {
992    "cell_type": "code",
993    "execution_count": 76,
994    "metadata": {},
995    "outputs": [
996     {
997      "data": {
998       "text/html": [
999        "<div>\n",
1000        "<style>\n",
1001        "    .dataframe thead tr:only-child th {\n",
1002        "        text-align: right;\n",
1003        "    }\n",
1004        "\n",
1005        "    .dataframe thead th {\n",
1006        "        text-align: left;\n",
1007        "    }\n",
1008        "\n",
1009        "    .dataframe tbody tr th {\n",
1010        "        vertical-align: top;\n",
1011        "    }\n",
1012        "</style>\n",
1013        "<table border=\"1\" class=\"dataframe\">\n",
1014        "  <thead>\n",
1015        "    <tr style=\"text-align: right;\">\n",
1016        "      <th></th>\n",
1017        "      <th>to_cluster</th>\n",
1018        "      <th>from_cluster</th>\n",
1019        "      <th>value</th>\n",
1020        "    </tr>\n",
1021        "  </thead>\n",
1022        "  <tbody>\n",
1023        "    <tr>\n",
1024        "      <th>1</th>\n",
1025        "      <td>2</td>\n",
1026        "      <td>1</td>\n",
1027        "      <td>396</td>\n",
1028        "    </tr>\n",
1029        "    <tr>\n",
1030        "      <th>2</th>\n",
1031        "      <td>3</td>\n",
1032        "      <td>1</td>\n",
1033        "      <td>278</td>\n",
1034        "    </tr>\n",
1035        "    <tr>\n",
1036        "      <th>3</th>\n",
1037        "      <td>4</td>\n",
1038        "      <td>1</td>\n",
1039        "      <td>233</td>\n",
1040        "    </tr>\n",
1041        "    <tr>\n",
1042        "      <th>4</th>\n",
1043        "      <td>5</td>\n",
1044        "      <td>1</td>\n",
1045        "      <td>171</td>\n",
1046        "    </tr>\n",
1047        "    <tr>\n",
1048        "      <th>5</th>\n",
1049        "      <td>6</td>\n",
1050        "      <td>1</td>\n",
1051        "      <td>85</td>\n",
1052        "    </tr>\n",
1053        "    <tr>\n",
1054        "      <th>6</th>\n",
1055        "      <td>7</td>\n",
1056        "      <td>1</td>\n",
1057        "      <td>57</td>\n",
1058        "    </tr>\n",
1059        "    <tr>\n",
1060        "      <th>7</th>\n",
1061        "      <td>8</td>\n",
1062        "      <td>1</td>\n",
1063        "      <td>86</td>\n",
1064        "    </tr>\n",
1065        "    <tr>\n",
1066        "      <th>8</th>\n",
1067        "      <td>9</td>\n",
1068        "      <td>1</td>\n",
1069        "      <td>25</td>\n",
1070        "    </tr>\n",
1071        "    <tr>\n",
1072        "      <th>9</th>\n",
1073        "      <td>10</td>\n",
1074        "      <td>1</td>\n",
1075        "      <td>29</td>\n",
1076        "    </tr>\n",
1077        "    <tr>\n",
1078        "      <th>10</th>\n",
1079        "      <td>11</td>\n",
1080        "      <td>1</td>\n",
1081        "      <td>12</td>\n",
1082        "    </tr>\n",
1083        "    <tr>\n",
1084        "      <th>11</th>\n",
1085        "      <td>12</td>\n",
1086        "      <td>1</td>\n",
1087        "      <td>0</td>\n",
1088        "    </tr>\n",
1089        "    <tr>\n",
1090        "      <th>12</th>\n",
1091        "      <td>13</td>\n",
1092        "      <td>1</td>\n",
1093        "      <td>3</td>\n",
1094        "    </tr>\n",
1095        "    <tr>\n",
1096        "      <th>13</th>\n",
1097        "      <td>1</td>\n",
1098        "      <td>2</td>\n",
1099        "      <td>412</td>\n",
1100        "    </tr>\n",
1101        "    <tr>\n",
1102        "      <th>15</th>\n",
1103        "      <td>3</td>\n",
1104        "      <td>2</td>\n",
1105        "      <td>117</td>\n",
1106        "    </tr>\n",
1107        "    <tr>\n",
1108        "      <th>16</th>\n",
1109        "      <td>4</td>\n",
1110        "      <td>2</td>\n",
1111        "      <td>126</td>\n",
1112        "    </tr>\n",
1113        "    <tr>\n",
1114        "      <th>17</th>\n",
1115        "      <td>5</td>\n",
1116        "      <td>2</td>\n",
1117        "      <td>187</td>\n",
1118        "    </tr>\n",
1119        "    <tr>\n",
1120        "      <th>18</th>\n",
1121        "      <td>6</td>\n",
1122        "      <td>2</td>\n",
1123        "      <td>104</td>\n",
1124        "    </tr>\n",
1125        "    <tr>\n",
1126        "      <th>19</th>\n",
1127        "      <td>7</td>\n",
1128        "      <td>2</td>\n",
1129        "      <td>175</td>\n",
1130        "    </tr>\n",
1131        "    <tr>\n",
1132        "      <th>20</th>\n",
1133        "      <td>8</td>\n",
1134        "      <td>2</td>\n",
1135        "      <td>68</td>\n",
1136        "    </tr>\n",
1137        "    <tr>\n",
1138        "      <th>21</th>\n",
1139        "      <td>9</td>\n",
1140        "      <td>2</td>\n",
1141        "      <td>16</td>\n",
1142        "    </tr>\n",
1143        "    <tr>\n",
1144        "      <th>22</th>\n",
1145        "      <td>10</td>\n",
1146        "      <td>2</td>\n",
1147        "      <td>4</td>\n",
1148        "    </tr>\n",
1149        "    <tr>\n",
1150        "      <th>23</th>\n",
1151        "      <td>11</td>\n",
1152        "      <td>2</td>\n",
1153        "      <td>3</td>\n",
1154        "    </tr>\n",
1155        "    <tr>\n",
1156        "      <th>24</th>\n",
1157        "      <td>12</td>\n",
1158        "      <td>2</td>\n",
1159        "      <td>0</td>\n",
1160        "    </tr>\n",
1161        "    <tr>\n",
1162        "      <th>25</th>\n",
1163        "      <td>13</td>\n",
1164        "      <td>2</td>\n",
1165        "      <td>4</td>\n",
1166        "    </tr>\n",
1167        "    <tr>\n",
1168        "      <th>26</th>\n",
1169        "      <td>1</td>\n",
1170        "      <td>3</td>\n",
1171        "      <td>184</td>\n",
1172        "    </tr>\n",
1173        "    <tr>\n",
1174        "      <th>27</th>\n",
1175        "      <td>2</td>\n",
1176        "      <td>3</td>\n",
1177        "      <td>150</td>\n",
1178        "    </tr>\n",
1179        "    <tr>\n",
1180        "      <th>29</th>\n",
1181        "      <td>4</td>\n",
1182        "      <td>3</td>\n",
1183        "      <td>174</td>\n",
1184        "    </tr>\n",
1185        "    <tr>\n",
1186        "      <th>30</th>\n",
1187        "      <td>5</td>\n",
1188        "      <td>3</td>\n",
1189        "      <td>345</td>\n",
1190        "    </tr>\n",
1191        "    <tr>\n",
1192        "      <th>31</th>\n",
1193        "      <td>6</td>\n",
1194        "      <td>3</td>\n",
1195        "      <td>11</td>\n",
1196        "    </tr>\n",
1197        "    <tr>\n",
1198        "      <th>32</th>\n",
1199        "      <td>7</td>\n",
1200        "      <td>3</td>\n",
1201        "      <td>99</td>\n",
1202        "    </tr>\n",
1203        "    <tr>\n",
1204        "      <th>...</th>\n",
1205        "      <td>...</td>\n",
1206        "      <td>...</td>\n",
1207        "      <td>...</td>\n",
1208        "    </tr>\n",
1209        "    <tr>\n",
1210        "      <th>204</th>\n",
1211        "      <td>10</td>\n",
1212        "      <td>16</td>\n",
1213        "      <td>0</td>\n",
1214        "    </tr>\n",
1215        "    <tr>\n",
1216        "      <th>205</th>\n",
1217        "      <td>11</td>\n",
1218        "      <td>16</td>\n",
1219        "      <td>0</td>\n",
1220        "    </tr>\n",
1221        "    <tr>\n",
1222        "      <th>206</th>\n",
1223        "      <td>12</td>\n",
1224        "      <td>16</td>\n",
1225        "      <td>0</td>\n",
1226        "    </tr>\n",
1227        "    <tr>\n",
1228        "      <th>207</th>\n",
1229        "      <td>13</td>\n",
1230        "      <td>16</td>\n",
1231        "      <td>1</td>\n",
1232        "    </tr>\n",
1233        "    <tr>\n",
1234        "      <th>208</th>\n",
1235        "      <td>1</td>\n",
1236        "      <td>17</td>\n",
1237        "      <td>0</td>\n",
1238        "    </tr>\n",
1239        "    <tr>\n",
1240        "      <th>209</th>\n",
1241        "      <td>2</td>\n",
1242        "      <td>17</td>\n",
1243        "      <td>0</td>\n",
1244        "    </tr>\n",
1245        "    <tr>\n",
1246        "      <th>210</th>\n",
1247        "      <td>3</td>\n",
1248        "      <td>17</td>\n",
1249        "      <td>0</td>\n",
1250        "    </tr>\n",
1251        "    <tr>\n",
1252        "      <th>211</th>\n",
1253        "      <td>4</td>\n",
1254        "      <td>17</td>\n",
1255        "      <td>3</td>\n",
1256        "    </tr>\n",
1257        "    <tr>\n",
1258        "      <th>212</th>\n",
1259        "      <td>5</td>\n",
1260        "      <td>17</td>\n",
1261        "      <td>4</td>\n",
1262        "    </tr>\n",
1263        "    <tr>\n",
1264        "      <th>213</th>\n",
1265        "      <td>6</td>\n",
1266        "      <td>17</td>\n",
1267        "      <td>0</td>\n",
1268        "    </tr>\n",
1269        "    <tr>\n",
1270        "      <th>214</th>\n",
1271        "      <td>7</td>\n",
1272        "      <td>17</td>\n",
1273        "      <td>0</td>\n",
1274        "    </tr>\n",
1275        "    <tr>\n",
1276        "      <th>215</th>\n",
1277        "      <td>8</td>\n",
1278        "      <td>17</td>\n",
1279        "      <td>2</td>\n",
1280        "    </tr>\n",
1281        "    <tr>\n",
1282        "      <th>216</th>\n",
1283        "      <td>9</td>\n",
1284        "      <td>17</td>\n",
1285        "      <td>0</td>\n",
1286        "    </tr>\n",
1287        "    <tr>\n",
1288        "      <th>217</th>\n",
1289        "      <td>10</td>\n",
1290        "      <td>17</td>\n",
1291        "      <td>0</td>\n",
1292        "    </tr>\n",
1293        "    <tr>\n",
1294        "      <th>218</th>\n",
1295        "      <td>11</td>\n",
1296        "      <td>17</td>\n",
1297        "      <td>0</td>\n",
1298        "    </tr>\n",
1299        "    <tr>\n",
1300        "      <th>219</th>\n",
1301        "      <td>12</td>\n",
1302        "      <td>17</td>\n",
1303        "      <td>0</td>\n",
1304        "    </tr>\n",
1305        "    <tr>\n",
1306        "      <th>220</th>\n",
1307        "      <td>13</td>\n",
1308        "      <td>17</td>\n",
1309        "      <td>0</td>\n",
1310        "    </tr>\n",
1311        "    <tr>\n",
1312        "      <th>221</th>\n",
1313        "      <td>1</td>\n",
1314        "      <td>18</td>\n",
1315        "      <td>3</td>\n",
1316        "    </tr>\n",
1317        "    <tr>\n",
1318        "      <th>222</th>\n",
1319        "      <td>2</td>\n",
1320        "      <td>18</td>\n",
1321        "      <td>0</td>\n",
1322        "    </tr>\n",
1323        "    <tr>\n",
1324        "      <th>223</th>\n",
1325        "      <td>3</td>\n",
1326        "      <td>18</td>\n",
1327        "      <td>0</td>\n",
1328        "    </tr>\n",
1329        "    <tr>\n",
1330        "      <th>224</th>\n",
1331        "      <td>4</td>\n",
1332        "      <td>18</td>\n",
1333        "      <td>2</td>\n",
1334        "    </tr>\n",
1335        "    <tr>\n",
1336        "      <th>225</th>\n",
1337        "      <td>5</td>\n",
1338        "      <td>18</td>\n",
1339        "      <td>2</td>\n",
1340        "    </tr>\n",
1341        "    <tr>\n",
1342        "      <th>226</th>\n",
1343        "      <td>6</td>\n",
1344        "      <td>18</td>\n",
1345        "      <td>0</td>\n",
1346        "    </tr>\n",
1347        "    <tr>\n",
1348        "      <th>227</th>\n",
1349        "      <td>7</td>\n",
1350        "      <td>18</td>\n",
1351        "      <td>0</td>\n",
1352        "    </tr>\n",
1353        "    <tr>\n",
1354        "      <th>228</th>\n",
1355        "      <td>8</td>\n",
1356        "      <td>18</td>\n",
1357        "      <td>0</td>\n",
1358        "    </tr>\n",
1359        "    <tr>\n",
1360        "      <th>229</th>\n",
1361        "      <td>9</td>\n",
1362        "      <td>18</td>\n",
1363        "      <td>0</td>\n",
1364        "    </tr>\n",
1365        "    <tr>\n",
1366        "      <th>230</th>\n",
1367        "      <td>10</td>\n",
1368        "      <td>18</td>\n",
1369        "      <td>0</td>\n",
1370        "    </tr>\n",
1371        "    <tr>\n",
1372        "      <th>231</th>\n",
1373        "      <td>11</td>\n",
1374        "      <td>18</td>\n",
1375        "      <td>0</td>\n",
1376        "    </tr>\n",
1377        "    <tr>\n",
1378        "      <th>232</th>\n",
1379        "      <td>12</td>\n",
1380        "      <td>18</td>\n",
1381        "      <td>0</td>\n",
1382        "    </tr>\n",
1383        "    <tr>\n",
1384        "      <th>233</th>\n",
1385        "      <td>13</td>\n",
1386        "      <td>18</td>\n",
1387        "      <td>0</td>\n",
1388        "    </tr>\n",
1389        "  </tbody>\n",
1390        "</table>\n",
1391        "<p>221 rows × 3 columns</p>\n",
1392        "</div>"
1393       ],
1394       "text/plain": [
1395        "     to_cluster from_cluster  value\n",
1396        "1             2            1    396\n",
1397        "2             3            1    278\n",
1398        "3             4            1    233\n",
1399        "4             5            1    171\n",
1400        "5             6            1     85\n",
1401        "6             7            1     57\n",
1402        "7             8            1     86\n",
1403        "8             9            1     25\n",
1404        "9            10            1     29\n",
1405        "10           11            1     12\n",
1406        "11           12            1      0\n",
1407        "12           13            1      3\n",
1408        "13            1            2    412\n",
1409        "15            3            2    117\n",
1410        "16            4            2    126\n",
1411        "17            5            2    187\n",
1412        "18            6            2    104\n",
1413        "19            7            2    175\n",
1414        "20            8            2     68\n",
1415        "21            9            2     16\n",
1416        "22           10            2      4\n",
1417        "23           11            2      3\n",
1418        "24           12            2      0\n",
1419        "25           13            2      4\n",
1420        "26            1            3    184\n",
1421        "27            2            3    150\n",
1422        "29            4            3    174\n",
1423        "30            5            3    345\n",
1424        "31            6            3     11\n",
1425        "32            7            3     99\n",
1426        "..          ...          ...    ...\n",
1427        "204          10           16      0\n",
1428        "205          11           16      0\n",
1429        "206          12           16      0\n",
1430        "207          13           16      1\n",
1431        "208           1           17      0\n",
1432        "209           2           17      0\n",
1433        "210           3           17      0\n",
1434        "211           4           17      3\n",
1435        "212           5           17      4\n",
1436        "213           6           17      0\n",
1437        "214           7           17      0\n",
1438        "215           8           17      2\n",
1439        "216           9           17      0\n",
1440        "217          10           17      0\n",
1441        "218          11           17      0\n",
1442        "219          12           17      0\n",
1443        "220          13           17      0\n",
1444        "221           1           18      3\n",
1445        "222           2           18      0\n",
1446        "223           3           18      0\n",
1447        "224           4           18      2\n",
1448        "225           5           18      2\n",
1449        "226           6           18      0\n",
1450        "227           7           18      0\n",
1451        "228           8           18      0\n",
1452        "229           9           18      0\n",
1453        "230          10           18      0\n",
1454        "231          11           18      0\n",
1455        "232          12           18      0\n",
1456        "233          13           18      0\n",
1457        "\n",
1458        "[221 rows x 3 columns]"
1459       ]
1460      },
1461      "metadata": {},
1462      "output_type": "display_data"
1463     }
1464    ],
1465    "source": [
1466     "# label every edge with the cluster of the article in its \"to\" column\n",
1467     "edgelist_tmp = pd.merge(raw_edgelist, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"to\", right_on=\"eid\")\n",
1468     "edgelist_tmp = edgelist_tmp.rename(columns={\"cluster\": \"to_cluster\"}).drop(\"eid\", axis=1)\n",
1469     "\n",
1470     "# ...and with the cluster of the article in its \"from\" column\n",
1471     "edgelist_tmp = pd.merge(edgelist_tmp, g_sm_clu[[\"eid\", \"cluster\"]], how=\"inner\", left_on=\"from\", right_on=\"eid\")\n",
1472     "edgelist_tmp = edgelist_tmp.rename(columns={\"cluster\": \"from_cluster\"}).drop(\"eid\", axis=1)\n",
1473     "\n",
1474     "# keep only the edges whose two endpoints sit in different clusters\n",
1475     "edgelist_tmp = edgelist_tmp[[\"to_cluster\", \"from_cluster\"]]\n",
1476     "edgelist_tmp = edgelist_tmp[edgelist_tmp[\"to_cluster\"] != edgelist_tmp[\"from_cluster\"]]\n",
1477     "\n",
1478     "# count edges per (to, from) cluster pair, reshape to long form, then drop the zero-filled diagonal the crosstab adds\n",
1479     "cluster_edgelist = pd.crosstab(edgelist_tmp[\"to_cluster\"], edgelist_tmp[\"from_cluster\"])\n",
1480     "cluster_edgelist[\"to_cluster\"] = cluster_edgelist.index\n",
1481     "cluster_edgelist = pd.melt(cluster_edgelist, id_vars=[\"to_cluster\"])\n",
1482     "cluster_edgelist = cluster_edgelist[cluster_edgelist[\"to_cluster\"] != cluster_edgelist[\"from_cluster\"]]\n",
1483     "remember(\"cluster_edgelist\", cluster_edgelist)"
1484    ]
1485   },
1486   {
1487    "cell_type": "code",
1488    "execution_count": 77,
1489    "metadata": {},
1490    "outputs": [],
1491    "source": [
1492     "# rank the clusters by article count and keep the largest ones\n",
1493     "n_top_clusters = 6\n",
1494     "cluster_sizes = g_sm_clu[\"cluster\"].value_counts()\n",
1495     "top_clusters = cluster_sizes.head(n_top_clusters).index\n",
1496     "\n",
1497     "# keep only the non-empty edges that run between two of the top clusters\n",
1498     "cluster_edgelist_output = cluster_edgelist[(cluster_edgelist[\"to_cluster\"].isin(top_clusters)) &\n",
1499     "                                           (cluster_edgelist[\"from_cluster\"].isin(top_clusters))]\n",
1500     "cluster_edgelist_output = cluster_edgelist_output[cluster_edgelist_output[\"value\"] > 0]\n",
1501     "\n",
1502     "# build the directed cluster graph; the weight assignment relies on TupleList keeping the edges in row order\n",
1503     "g_cluster = Graph.TupleList([tuple(x) for x in cluster_edgelist_output[[\"from_cluster\", \"to_cluster\"]].values], directed=True)\n",
1504     "g_cluster.es[\"weight\"] = cluster_edgelist_output[\"value\"].tolist()\n",
1505     "g_cluster.vs[\"papers\"] = cluster_sizes.loc[[v[\"name\"] for v in g_cluster.vs]].tolist()  # number of articles per cluster\n",
1506     "g_cluster.write_graphml(\"clusters.graphml\")"
1507    ]
1508   },
1509   {
1510    "cell_type": "markdown",
1511    "metadata": {},
1512    "source": [
1513     "# Create network stats for tables (overall and within clusters)"
1514    ]
1515   },
1516   {
1517    "cell_type": "code",
1518    "execution_count": 78,
1519    "metadata": {},
1520    "outputs": [],
1521    "source": [
1522     "def create_network_stats(g):\n",
1523     "    \"\"\"Return per-vertex stats of graph `g` (eid, eig_cent, indegree, betweenness) joined to each article's title and source_title.\"\"\"\n",
1524     "    network_stats = pd.DataFrame({'eid' : g.vs['name'],\n",
1525     "                                  'eig_cent' : g.eigenvector_centrality(),\n",
1526     "                                  'indegree' : g.indegree(),\n",
1527     "                                  'betweenness' : g.betweenness()})\n",
1528     "\n",
1529     "    # name the join key explicitly; the inner join drops vertices that have no row in the global `articles` frame\n",
1530     "    return pd.merge(network_stats, articles[['eid', 'title', 'source_title']],\n",
1531     "                    how=\"inner\", on=\"eid\")"
1532    ]
1533   },
1534   {
1535    "cell_type": "code",
1536    "execution_count": 79,
1537    "metadata": {},
1538    "outputs": [],
1539    "source": [
1540     "network_stats = create_network_stats(g_full)"
1541    ]
1542   },
1543   {
1544    "cell_type": "code",
1545    "execution_count": 80,
1546    "metadata": {},
1547    "outputs": [
1548     {
1549      "data": {
1550       "text/html": [
1551        "<div>\n",
1552        "<style>\n",
1553        "    .dataframe thead tr:only-child th {\n",
1554        "        text-align: right;\n",
1555        "    }\n",
1556        "\n",
1557        "    .dataframe thead th {\n",
1558        "        text-align: left;\n",
1559        "    }\n",
1560        "\n",
1561        "    .dataframe tbody tr th {\n",
1562        "        vertical-align: top;\n",
1563        "    }\n",
1564        "</style>\n",
1565        "<table border=\"1\" class=\"dataframe\">\n",
1566        "  <thead>\n",
1567        "    <tr style=\"text-align: right;\">\n",
1568        "      <th></th>\n",
1569        "      <th>betweenness</th>\n",
1570        "      <th>eid</th>\n",
1571        "      <th>eig_cent</th>\n",
1572        "      <th>indegree</th>\n",
1573        "      <th>title</th>\n",
1574        "      <th>source_title</th>\n",
1575        "    </tr>\n",
1576        "  </thead>\n",
1577        "  <tbody>\n",
1578        "    <tr>\n",
1579        "      <th>2275</th>\n",
1580        "      <td>6393.560498</td>\n",
1581        "      <td>2-s2.0-71149088987</td>\n",
1582        "      <td>1.000000e+00</td>\n",
1583        "      <td>1876</td>\n",
1584        "      <td>Users of the world, unite! The challenges and ...</td>\n",
1585        "      <td>Business Horizons</td>\n",
1586        "    </tr>\n",
1587        "    <tr>\n",
1588        "      <th>179</th>\n",
1589        "      <td>0.000000</td>\n",
1590        "      <td>2-s2.0-43449135033</td>\n",
1591        "      <td>6.899762e-15</td>\n",
1592        "      <td>645</td>\n",
1593        "      <td>Why we twitter: Understanding microblogging us...</td>\n",
1594        "      <td>Joint Ninth WebKDD and First SNA-KDD 2007 Work...</td>\n",
1595        "    </tr>\n",
1596        "    <tr>\n",
1597        "      <th>5120</th>\n",
1598        "      <td>669.625397</td>\n",
1599        "      <td>2-s2.0-79953711711</td>\n",
1600        "      <td>7.271520e-02</td>\n",
1601        "      <td>468</td>\n",
1602        "      <td>Social media? Get serious! Understanding the f...</td>\n",
1603        "      <td>Business Horizons</td>\n",
1604        "    </tr>\n",
1605        "    <tr>\n",
1606        "      <th>1855</th>\n",
1607        "      <td>0.000000</td>\n",
1608        "      <td>2-s2.0-67349268124</td>\n",
1609        "      <td>2.974873e-01</td>\n",
1610        "      <td>450</td>\n",
1611        "      <td>Social media: The new hybrid element of the pr...</td>\n",
1612        "      <td>Business Horizons</td>\n",
1613        "    </tr>\n",
1614        "  </tbody>\n",
1615        "</table>\n",
1616        "</div>"
1617       ],
1618       "text/plain": [
1619        "      betweenness                 eid      eig_cent  indegree  \\\n",
1620        "2275  6393.560498  2-s2.0-71149088987  1.000000e+00      1876   \n",
1621        "179      0.000000  2-s2.0-43449135033  6.899762e-15       645   \n",
1622        "5120   669.625397  2-s2.0-79953711711  7.271520e-02       468   \n",
1623        "1855     0.000000  2-s2.0-67349268124  2.974873e-01       450   \n",
1624        "\n",
1625        "                                                  title  \\\n",
1626        "2275  Users of the world, unite! The challenges and ...   \n",
1627        "179   Why we twitter: Understanding microblogging us...   \n",
1628        "5120  Social media? Get serious! Understanding the f...   \n",
1629        "1855  Social media: The new hybrid element of the pr...   \n",
1630        "\n",
1631        "                                           source_title  \n",
1632        "2275                                  Business Horizons  \n",
1633        "179   Joint Ninth WebKDD and First SNA-KDD 2007 Work...  \n",
1634        "5120                                  Business Horizons  \n",
1635        "1855                                  Business Horizons  "
1636       ]
1637      },
1638      "execution_count": 80,
1639      "metadata": {},
1640      "output_type": "execute_result"
1641     }
1642    ],
1643    "source": [
1644     "network_stats.sort_values(\"indegree\", ascending=False).head(4)"
1645    ]
1646   },
1647   {
1648    "cell_type": "code",
1649    "execution_count": 81,
1650    "metadata": {},
1651    "outputs": [
1652     {
1653      "data": {
1654       "text/html": [
1655        "<div>\n",
1656        "<style>\n",
1657        "    .dataframe thead tr:only-child th {\n",
1658        "        text-align: right;\n",
1659        "    }\n",
1660        "\n",
1661        "    .dataframe thead th {\n",
1662        "        text-align: left;\n",
1663        "    }\n",
1664        "\n",
1665        "    .dataframe tbody tr th {\n",
1666        "        vertical-align: top;\n",
1667        "    }\n",
1668        "</style>\n",
1669        "<table border=\"1\" class=\"dataframe\">\n",
1670        "  <thead>\n",
1671        "    <tr style=\"text-align: right;\">\n",
1672        "      <th></th>\n",
1673        "      <th>betweenness</th>\n",
1674        "      <th>eid</th>\n",
1675        "      <th>eig_cent</th>\n",
1676        "      <th>indegree</th>\n",
1677        "      <th>title</th>\n",
1678        "      <th>source_title</th>\n",
1679        "    </tr>\n",
1680        "  </thead>\n",
1681        "  <tbody>\n",
1682        "    <tr>\n",
1683        "      <th>2275</th>\n",
1684        "      <td>6393.560498</td>\n",
1685        "      <td>2-s2.0-71149088987</td>\n",
1686        "      <td>1.000000</td>\n",
1687        "      <td>1876</td>\n",
1688        "      <td>Users of the world, unite! The challenges and ...</td>\n",
1689        "      <td>Business Horizons</td>\n",
1690        "    </tr>\n",
1691        "    <tr>\n",
1692        "      <th>2259</th>\n",
1693        "      <td>0.000000</td>\n",
1694        "      <td>2-s2.0-70349816888</td>\n",
1695        "      <td>0.605279</td>\n",
1696        "      <td>70</td>\n",
1697        "      <td>The fairyland of Second Life: Virtual social w...</td>\n",
1698        "      <td>Business Horizons</td>\n",
1699        "    </tr>\n",
1700        "    <tr>\n",
1701        "      <th>3612</th>\n",
1702        "      <td>0.000000</td>\n",
1703        "      <td>2-s2.0-77949522596</td>\n",
1704        "      <td>0.563979</td>\n",
1705        "      <td>335</td>\n",
1706        "      <td>Networked narratives: Understanding word-of-mo...</td>\n",
1707        "      <td>Journal of Marketing</td>\n",
1708        "    </tr>\n",
1709        "    <tr>\n",
1710        "      <th>7088</th>\n",
1711        "      <td>0.000000</td>\n",
1712        "      <td>2-s2.0-79551582037</td>\n",
1713        "      <td>0.432951</td>\n",
1714        "      <td>36</td>\n",
1715        "      <td>Online Personal Branding: Processes, Challenge...</td>\n",
1716        "      <td>Journal of Interactive Marketing</td>\n",
1717        "    </tr>\n",
1718        "  </tbody>\n",
1719        "</table>\n",
1720        "</div>"
1721       ],
1722       "text/plain": [
1723        "      betweenness                 eid  eig_cent  indegree  \\\n",
1724        "2275  6393.560498  2-s2.0-71149088987  1.000000      1876   \n",
1725        "2259     0.000000  2-s2.0-70349816888  0.605279        70   \n",
1726        "3612     0.000000  2-s2.0-77949522596  0.563979       335   \n",
1727        "7088     0.000000  2-s2.0-79551582037  0.432951        36   \n",
1728        "\n",
1729        "                                                  title  \\\n",
1730        "2275  Users of the world, unite! The challenges and ...   \n",
1731        "2259  The fairyland of Second Life: Virtual social w...   \n",
1732        "3612  Networked narratives: Understanding word-of-mo...   \n",
1733        "7088  Online Personal Branding: Processes, Challenge...   \n",
1734        "\n",
1735        "                          source_title  \n",
1736        "2275                 Business Horizons  \n",
1737        "2259                 Business Horizons  \n",
1738        "3612              Journal of Marketing  \n",
1739        "7088  Journal of Interactive Marketing  "
1740       ]
1741      },
1742      "execution_count": 81,
1743      "metadata": {},
1744      "output_type": "execute_result"
1745     }
1746    ],
1747    "source": [
1748     "network_stats.sort_values(\"eig_cent\", ascending=False).head(4)"
1749    ]
1750   },
1751   {
1752    "cell_type": "code",
1753    "execution_count": 82,
1754    "metadata": {},
1755    "outputs": [
1756     {
1757      "data": {
1758       "text/html": [
1759        "<div>\n",
1760        "<style>\n",
1761        "    .dataframe thead tr:only-child th {\n",
1762        "        text-align: right;\n",
1763        "    }\n",
1764        "\n",
1765        "    .dataframe thead th {\n",
1766        "        text-align: left;\n",
1767        "    }\n",
1768        "\n",
1769        "    .dataframe tbody tr th {\n",
1770        "        vertical-align: top;\n",
1771        "    }\n",
1772        "</style>\n",
1773        "<table border=\"1\" class=\"dataframe\">\n",
1774        "  <thead>\n",
1775        "    <tr style=\"text-align: right;\">\n",
1776        "      <th></th>\n",
1777        "      <th>betweenness</th>\n",
1778        "      <th>eid</th>\n",
1779        "      <th>eig_cent</th>\n",
1780        "      <th>indegree</th>\n",
1781        "      <th>title</th>\n",
1782        "      <th>source_title</th>\n",
1783        "    </tr>\n",
1784        "  </thead>\n",
1785        "  <tbody>\n",
1786        "    <tr>\n",
1787        "      <th>2275</th>\n",
1788        "      <td>6393.560498</td>\n",
1789        "      <td>2-s2.0-71149088987</td>\n",
1790        "      <td>1.000000e+00</td>\n",
1791        "      <td>1876</td>\n",
1792        "      <td>Users of the world, unite! The challenges and ...</td>\n",
1793        "      <td>Business Horizons</td>\n",
1794        "    </tr>\n",
1795        "    <tr>\n",
1796        "      <th>401</th>\n",
1797        "      <td>6220.250000</td>\n",
1798        "      <td>2-s2.0-70350491889</td>\n",
1799        "      <td>3.749870e-16</td>\n",
1800        "      <td>103</td>\n",
1801        "      <td>Crisis in a networked world: Features of compu...</td>\n",
1802        "      <td>Social Science Computer Review</td>\n",
1803        "    </tr>\n",
1804        "    <tr>\n",
1805        "      <th>2781</th>\n",
1806        "      <td>5131.824639</td>\n",
1807        "      <td>2-s2.0-84888047300</td>\n",
1808        "      <td>1.310283e-01</td>\n",
1809        "      <td>31</td>\n",
1810        "      <td>Social media metrics - A framework and guideli...</td>\n",
1811        "      <td>Journal of Interactive Marketing</td>\n",
1812        "    </tr>\n",
1813        "    <tr>\n",
1814        "      <th>3821</th>\n",
1815        "      <td>4319.747561</td>\n",
1816        "      <td>2-s2.0-84910136235</td>\n",
1817        "      <td>3.045168e-18</td>\n",
1818        "      <td>8</td>\n",
1819        "      <td>What are health-related users tweeting? A qual...</td>\n",
1820        "      <td>Journal of Medical Internet Research</td>\n",
1821        "    </tr>\n",
1822        "  </tbody>\n",
1823        "</table>\n",
1824        "</div>"
1825       ],
1826       "text/plain": [
1827        "      betweenness                 eid      eig_cent  indegree  \\\n",
1828        "2275  6393.560498  2-s2.0-71149088987  1.000000e+00      1876   \n",
1829        "401   6220.250000  2-s2.0-70350491889  3.749870e-16       103   \n",
1830        "2781  5131.824639  2-s2.0-84888047300  1.310283e-01        31   \n",
1831        "3821  4319.747561  2-s2.0-84910136235  3.045168e-18         8   \n",
1832        "\n",
1833        "                                                  title  \\\n",
1834        "2275  Users of the world, unite! The challenges and ...   \n",
1835        "401   Crisis in a networked world: Features of compu...   \n",
1836        "2781  Social media metrics - A framework and guideli...   \n",
1837        "3821  What are health-related users tweeting? A qual...   \n",
1838        "\n",
1839        "                              source_title  \n",
1840        "2275                     Business Horizons  \n",
1841        "401         Social Science Computer Review  \n",
1842        "2781      Journal of Interactive Marketing  \n",
1843        "3821  Journal of Medical Internet Research  "
1844       ]
1845      },
1846      "execution_count": 82,
1847      "metadata": {},
1848      "output_type": "execute_result"
1849     }
1850    ],
1851    "source": [
1852     "network_stats.sort_values(\"betweenness\", ascending=False).head(4)"
1853    ]
1854   },
1855   {
1856    "cell_type": "code",
1857    "execution_count": 83,
1858    "metadata": {
1859     "scrolled": true
1860    },
1861    "outputs": [
1862     {
1863      "data": {
1864       "text/plain": [
1865        "<matplotlib.axes._subplots.AxesSubplot at 0x7f178179c908>"
1866       ]
1867      },
1868      "execution_count": 83,
1869      "metadata": {},
1870      "output_type": "execute_result"
1871     },
1872     {
1873      "data": {
1874       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFKFJREFUeJzt3W9sW2fdxvHLifenJSVN7eKQLZNo1kqk2tYad0CgW/6YTlQIdRVEjBdoC6PJsjGyMbHhF9MkFinSiBwJGoEgRKNIaENKKEggJFO6ogRo4ixd1bAt6ZjUqFlM7NK667LO8XleVPPT0KS1XZ/45Ob7eVWf2T5X7sTXnJ9PznFZlmUJAGCskmIHAADYi6IHAMNR9ABgOIoeAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwFD0AGI6iBwDDuYu589HRUUWjUbW2tur06dN5PYfX69Xc3FyBkxWW0zM6PZ9ExkJwej7J+Rmdlq+qqiqr+xW16AOBgAKBQDEjAIDxGN0AgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC4oh5HXwiz99cVbd+lP/td0fYNANniHT0AGM6Wop+fn9fTTz+taDRqx9MDAHKQ1eimt7dXY2NjKi8vV3d3d2b7+Pi4+vv7lU6n1dTUpD179kiSDh48qM9+9rP2JAYA5CSrd/T19fUKhUKLtqXTafX19SkUCikcDmtoaEjT09N67bXXdOutt2r9+vW2BAYA5Card/S1tbWKxWKLtk1NTamyslI+n0+SVFdXp5GREc3Pz+v999/X9PS0brzxRm3fvl0lJXwUAADFkvdRN4lEQh6PJ3Pb4/FocnJS3/zmNyVJhw8f1rp165Yt+UgkokgkIknq6uqS1+vNK8dsXo8qjGwzu93uvL++leD0fBIZC8Hp+STnZ3R6vuXkXfSWZV2xzeVyZf5dX19/1ccHg0EFg8HMbSed4zlb2WZ22jms/5vT80lkLASn55Ocn9Fp+bI9H33eMxWPx6N4PJ65HY/HVVFRkdNzjI6O6qc//Wm+EQAAWci76GtqajQzM6NYLKZUKqXh4eGcLyISCATU2tqabwQAQBayGt309PRoYmJCyWRSbW1tam5uVmNjo1paWtTZ2al0Oq2GhgZVV1fntPPLLyUIALBHVkXf0dGx5Ha/3y+/35/3zrmUIADYj+MeAcBwRS16PowFAPsV9eyVjG4AwH6MbgDAcIxuAMBwjG4AwHCMbgDAcBQ9ABiOGT0AGI4ZPQAYjtENABiOogcAw1H0AGA4PowFAMPxYSwAGI7RDQAYjqIHAMNR9ABgOIoeAAzHUTcAYDiOugEAwzG6AQDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIbjOHoAMBzH0QOA4RjdAIDhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABgOIoeAAxH0QOA4Qr+l7HT09P6wx/+oGQyqTvuuEO7du0q9C4AADnIquh7e3s1Njam8vJydXd3Z7aPj4+rv79f6XRaTU1N2rNnj2699Vbt27dP6XSa89gAgANkNbqpr69XKBRatC2dTquvr0+hUEjhcFhDQ0Oanp6WdOlkZc8++6zuuOOOwicGAOQkq6Kvra1VWVnZom1TU1OqrKyUz+eT2+1WXV2dRkZGJF06Wdnzzz+vv/71r4VPDADISd4z+kQiIY/Hk7nt8Xg0OTmpEydO6B//+IdSqZS2b9++7OMjkYgikYgkqaurS16vN68cs3k9qjCyzex2u/P++laC0/NJZCwEp+eTnJ/R6fmWk3fRW5Z1xTaXy6WtW7dq69at13x8MBhUMBjM3J6bm8s3StFkm9nr9Tr663N6PomMheD0fJLzMzotX1VVVVb3y/vwSo/Ho3g8nrkdj8dVUVGR03Nw4REAsF/eRV9TU6OZmRnFYjGlUikNDw/nfBGRQCCg1tbWfCMAALKQ1eimp6dHExMTSiaTamtrU3NzsxobG9XS0qLOzk6l02k1NDSouro6p52Pjo4qGo1S9gBgo6yKvqOjY8ntfr9ffr8/751zKUEAsB+nQAA
AwxW16PkwFgDsV/Bz3eSC0Q0A2I/RDQAYjtENABiO0Q0AGI7RDQAYjqIHAMMxowcAwzGjBwDDMboBAMNR9ABgOIoeAAzHh7EAYDg+jAUAwzG6AQDDUfQAYDiKHgAMR9EDgOE46gYADMdRNwBgOEY3AGA4ih4ADEfRA4DhKHoAMBxFDwCGo+gBwHAcRw8AhuM4egAwHKMbADAcRQ8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwnC1/GXv06FGNjY3p3Llzuu+++3TXXXfZsRsAQBayLvre3l6NjY2pvLxc3d3dme3j4+Pq7+9XOp1WU1OT9uzZo7vvvlt33323zp8/rwMHDlD0AFBEWY9u6uvrFQqFFm1Lp9Pq6+tTKBRSOBzW0NCQpqenM/99YGBA9913X+HSAgBylnXR19bWqqysbNG2qakpVVZWyufzye12q66uTiMjI7IsS7/61a+0bds2bdq0qeChAQDZu64ZfSKRkMfjydz2eDyanJzUH//4Rx0/flwXLlzQO++8o127dl3x2EgkokgkIknq6uqS1+vNK8NsftELItvMbrc7769vJTg9n0TGQnB6Psn5GZ2ebznXVfSWZV2xzeVyaffu3dq9e/dVHxsMBhUMBjO35+bmridKUWSb2ev1Ovrrc3o+iYyF4PR8kvMzOi1fVVVVVve7rsMrPR6P4vF45nY8HldFRUXWj+fCIwBgv+sq+pqaGs3MzCgWiymVSml4eDinC4kEAgG1trZeTwQAwDVkPbrp6enRxMSEksmk2tra1NzcrMbGRrW0tKizs1PpdFoNDQ2qrq7Oeuejo6OKRqOUPQDYKOui7+joWHK73++X3+/Pa+dcShAA7McpEADAcEUtej6MBQD72XKum2wxugEA+zG6AQDDMboBAMMxugEAwzG6AQDDUfQAYDhm9ABgOGb0AGA4RjcAYDiKHgAMR9EDgOH4MBYADMeHsQBgOEY3AGA4ih4ADEfRA4DhKHoAMBxH3QCA4TjqBgAMx+gGAAxH0QOA4Sh6ADAcRQ8AhqPoAcBwRT3qZrVb+NaXs7rfbIH3W/qz3xX4GQGYjOPoAcBwHEcPAIZjRg8AhqPoAcBwFD0AGI6iBwDDUfQAYDiKHgAMR9EDgOEoegAwHEUPAIYr+F/Gzs7OamBgQBcuXNB3v/vdQj89ACBHWb2j7+3t1cMPP3xFcY+Pj+s73/mOvv3tb+u3v/2tJMnn8+mRRx4pfFIAQF6yKvr6+nqFQqFF29LptPr6+hQKhRQOhzU0NKTp6WlbQgIA8pdV0dfW1qqsrGzRtqmpKVVWVsrn88ntdquurk4jIyO2hAQA5C/vGX0ikZDH48nc9ng8mpycVDKZ1K9//Wu9/fbbGhwc1P3337/k4yORiCKRiCSpq6tLXq83rxyFPtf7apDvWi3H7XYX/DkLjYzXz+n5JOdndHq+5eRd9JZlXbHN5XJp3bp12rdv3zUfHwwGFQwGM7fn5ubyjfI/p9Br5fV6Hb/+ZLx+Ts8nOT+j0/JVVVVldb+8D6/0eDyKx+OZ2/F4XBUVFTk9BxceAQD75V30NTU1mpmZUSwWUyqV0vDwcM4XEQkEAmptbc03AgAgC1mNbnp6ejQxMaFkMqm2tjY1NzersbFRLS0t6uzsVDqdVkNDg6qrq3Pa+ejoqKLRKGUPADbKqug7OjqW3O73++X3+/PeOZcSBAD7cQoEADBcUYueD2MBwH4FP9dNLhjdAID9GN0AgOEY3QCA4RjdAIDhGN0AgOEoegAwHDN6ADAcM3oAMByjGwAwHEUPAIaj6AHAcHwYCwCG48NYADAcoxsAMBxFDwCGo+gBwHAUPQAYrqgfxnJx8PwsfOvLBX2+2RzuW/qz3xV03wDsx1E3AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcJy9EgAMx3H0AGA4RjcAYDiKHgAMR9EDgOEoegAwHEUPAIaj6AHAcBQ9ABiOogcAw1H0AGC
4gv9l7Pz8vH7+85/L7XZr69at2rlzZ6F3AQDIQVZF39vbq7GxMZWXl6u7uzuzfXx8XP39/Uqn02pqatKePXt09OhRfeYzn1EgEFA4HKboAaDIshrd1NfXKxQKLdqWTqfV19enUCikcDisoaEhTU9PKx6Py+v1XnryEiZDAFBsWTVxbW2tysrKFm2bmppSZWWlfD6f3G636urqNDIyIo/Ho3g8LkmyLKvwiQEAOcl7Rp9IJOTxeDK3PR6PJicn9cUvflG/+MUvNDY2pk996lPLPj4SiSgSiUiSurq6Mr8F5Go2r0chXwvf+nJR9uv+/dG8f0ZWitvtdnRGp+eTipdx9v667O5nw759g8M2POtieRf9Uu/WXS6Xbr75ZrW3t1/z8cFgUMFgMHN7bm4u3yj4H5BKpRz/M+L1eh2d0en5pNWRsdCu5+utqqrK6n55D9EvH9FIUjweV0VFRU7PwYVHAMB+eRd9TU2NZmZmFIvFlEqlNDw8nPNFRAKBgFpbW/ONAADIQlajm56eHk1MTCiZTKqtrU3Nzc1qbGxUS0uLOjs7lU6n1dDQoOrqarvzAgBylFXRd3R0LLnd7/fL7/fnvfPR0VFFo1He1QOAjbhmLAAYrqh/0cSHsQBgP97RA4DhOEcBABjOZXGeAgAw2qp/R//MM88UO8I1OT2j0/NJZCwEp+eTnJ/R6fmWs+qLHgBwdRQ9ABiu9Lnnnnuu2CGu16ZNm4od4ZqcntHp+SQyFoLT80nOz+j0fEvhw1gAMByjGwAwXFH/YOp6LXXN2pU2Nzen/fv36z//+Y9cLpeCwaB2796tl19+WX/+85/10Y9+VJL0wAMPZM4LNDg4qEOHDqmkpEQPPfSQtm3bZnvORx99VDfffLNKSkpUWlqqrq4unT9/XuFwWP/+97+1ceNGPfHEEyorK5NlWerv79err76qm266Se3t7bb+unr69GmFw+HM7VgspubmZr377rtFXcOlrpWcz5odPnxYAwMDkqS9e/eqvr7e1owHDhxQNBqV2+2Wz+dTe3u7PvKRjygWi+mJJ57InMN88+bN2rdvnyTprbfe0v79+3Xx4kVt375dDz30kFwuly358nlt2PlaXypjOBzW6dOnJUkXLlzQ2rVr9cILLxRlDQvCWqUWFhasxx57zHrnnXesDz74wHrqqaesU6dOrXiORCJhnTx50rIsy7pw4YL1+OOPW6dOnbJeeukl6+DBg1fc/9SpU9ZTTz1lXbx40ZqdnbUee+wxa2Fhwfac7e3t1tmzZxdtO3DggDU4OGhZlmUNDg5aBw4csCzLsqLRqNXZ2Wml02nrjTfesL7//e/bnu9DCwsL1sMPP2zFYrGir+GJEyeskydPWk8++WRmW65rlkwmrUcffdRKJpOL/m1nxvHxcSuVSmXyfphxdnZ20f0u98wzz1hvvPGGlU6nrc7OTmtsbMy2fLl+X+1+rS+V8XIvvvii9Zvf/MayrOKsYSGs2tHNctesXWkVFRWZd25r1qzRLbfcokQisez9R0ZGVFdXpxtuuEEf+9jHVFlZqampqZWKe0WWe++9V5J07733ZtZvdHRU99xzj1wul7Zs2aJ3331XZ86cWZFMx48fV2VlpTZu3HjV3CuxhktdKznXNRsfH9edd96psrIylZWV6c4779T4+LitGe+66y6VlpZKkrZs2XLVn0dJOnPmjN577z1t2bJFLpdL99xzT8FeS0vlW85y31e7X+tXy2hZlv72t7/pc5/73FWfw841LIRVO7pZ7pq1xRSLxfSvf/1Lt99+u15//XX96U9/0pEjR7Rp0yZ94xvfUFlZmRKJhDZv3px5zIYNG675QiyUzs5OSdIXvvAFBYNBnT17NnNVsIqKCp07d07SpbW9/LqdHo9HiUQi5yuI5WNoaGjRi8ppa5jrmv33z+lKZpWkQ4cOqa7u/6+HGovF9L3vfU9r1qzR1772NX3yk59c8rVkd8Zcv6/Feq3/85//VHl5uT7+8Y9ntjl
lDXOxaoveWuaatcUyPz+v7u5uPfjgg1q7dq127dqlr3zlK5Kkl156Sb/85S/V3t6+ZO6V8IMf/EAbNmzQ2bNn9fzzz1/1WpPFWttUKqVoNKqvf/3rkuS4NbyaXNZspX5OBwYGVFpaqp07d0q69D+m3t5erVu3Tm+99ZZeeOEFdXd3r/h65vp9LeZr/b/feDhlDXO1akc3hbhmbaGkUil1d3dr586d+vSnPy1JWr9+vUpKSlRSUqKmpiadPHlyydyJREIbNmywPeOH+ygvL9eOHTs0NTWl8vLyzEjmzJkzmQ/HPB7PogsWr9Tavvrqq/rEJz6h9evXS3LeGkrKec02bNhwRdaVWMvDhw8rGo3q8ccfz5TiDTfcoHXr1km6dCy4z+fTzMzMkq8lO9cz1+9rsV7rCwsLOnr06KLfiJyyhrlatUVfiGvWFoJlWfrJT36iW265RV/60pcy2y+faR89ejRzmcVAIKDh4WF98MEHisVimpmZ0e23325rxvn5eb333nuZf7/22mu67bbbFAgE9Morr0iSXnnlFe3YsSOT8ciRI7IsS2+++abWrl1blLGNk9bwQ7mu2bZt23Ts2DGdP39e58+f17Fjx2w/ymp8fFwHDx7U008/rZtuuimz/dy5c0qn05Kk2dlZzczMyOfzqaKiQmvWrNGbb74py7J05MgRW19LuX5fi/VaP378uKqqqhaNZJyyhrla1X8wNTY2phdffDFzzdq9e/eueIbXX39dzz77rG677bbMO6cHHnhAQ0NDevvtt+VyubRx40bt27cvU5YDAwP6y1/+opKSEj344IPavn27rRlnZ2f1wx/+UNKldymf//zntXfvXiWTSYXDYc3Nzcnr9erJJ5/MHCrY19enY8eO6cYbb1R7e7tqampszfj+++/rkUce0Y9//GOtXbtWkvSjH/2oqGt4+bWSy8vL1dzcrB07duS8ZocOHdLg4KCkS4dXNjQ02JpxcHBQqVQq8wHjh4cA/v3vf9fLL7+s0tJSlZSU6Ktf/WqmjE6ePKne3l5dvHhR27ZtU0tLS0HGI0vlO3HiRM7fVztf60tlbGxs1P79+7V582bt2rUrc99irGEhrOqiBwBc26od3QAAskPRA4DhKHoAMBxFDwCGo+gBwHAUPQAYjqIHAMNR9ABguP8DaoV4MSni/p8AAAAASUVORK5CYII=\n",
1875       "text/plain": [
1876        "<matplotlib.figure.Figure at 0x7f178c2ee4e0>"
1877       ]
1878      },
1879      "metadata": {},
1880      "output_type": "display_data"
1881     }
1882    ],
1883    "source": [
1884     "network_stats['indegree'].hist(log = True)"
1885    ]
1886   },
1887   {
1888    "cell_type": "markdown",
1889    "metadata": {},
1890    "source": [
1891     "# things to store"
1892    ]
1893   },
1894   {
1895    "cell_type": "code",
1896    "execution_count": 84,
1897    "metadata": {},
1898    "outputs": [
1899     {
1900      "data": {
1901       "text/plain": [
1902        "23131"
1903       ]
1904      },
1905      "metadata": {},
1906      "output_type": "display_data"
1907     }
1908    ],
1909    "source": [
1910     "remember('total_articles', articles.shape[0])"
1911    ]
1912   },
1913   {
1914    "cell_type": "code",
1915    "execution_count": 85,
1916    "metadata": {},
1917    "outputs": [
1918     {
1919      "data": {
1920       "text/plain": [
1921        "35620"
1922       ]
1923      },
1924      "metadata": {},
1925      "output_type": "display_data"
1926     },
1927     {
1928      "data": {
1929       "text/plain": [
1930        "4807"
1931       ]
1932      },
1933      "metadata": {},
1934      "output_type": "display_data"
1935     },
1936     {
1937      "data": {
1938       "text/plain": [
1939        "3864"
1940       ]
1941      },
1942      "metadata": {},
1943      "output_type": "display_data"
1944     }
1945    ],
1946    "source": [
1947     "# total number of citations in the sm dataset\n",
1948     "remember('sm_citations', raw_edgelist.shape[0])\n",
1949     "\n",
1950     "remember('sm_citing', len(raw_edgelist[\"from\"].unique()))\n",
1951     "\n",
1952     "# the number of articles in the original dataset that have any INCOMING citations\n",
1953     "remember('sm_cited', len(raw_edgelist[\"to\"].unique()))"
1954    ]
1955   },
1956   {
1957    "cell_type": "code",
1958    "execution_count": 86,
1959    "metadata": {},
1960    "outputs": [
1961     {
1962      "data": {
1963       "text/plain": [
1964        "212773"
1965       ]
1966      },
1967      "metadata": {},
1968      "output_type": "display_data"
1969     },
1970     {
1971      "data": {
1972       "text/plain": [
1973        "42935"
1974       ]
1975      },
1976      "metadata": {},
1977      "output_type": "display_data"
1978     },
1979     {
1980      "data": {
1981       "text/plain": [
1982        "9710"
1983       ]
1984      },
1985      "metadata": {},
1986      "output_type": "display_data"
1987     }
1988    ],
1989    "source": [
1990     "# total number of citations in the combined dataset\n",
1991     "remember('all_citations', combo_raw_edgelist.shape[0])\n",
1992     "\n",
1993     "remember('all_citing', len(combo_raw_edgelist[\"from\"].unique()))\n",
1994     "\n",
1995     "# the number of articles in the combined dataset that have any INCOMING citations\n",
1996     "remember('all_cited', len(combo_raw_edgelist[\"to\"].unique()))"
1997    ]
1998   },
1999   {
2000    "cell_type": "code",
2001    "execution_count": 87,
2002    "metadata": {},
2003    "outputs": [
2004     {
2005      "data": {
2006       "text/html": [
2007        "<div>\n",
2008        "<style>\n",
2009        "    .dataframe thead tr:only-child th {\n",
2010        "        text-align: right;\n",
2011        "    }\n",
2012        "\n",
2013        "    .dataframe thead th {\n",
2014        "        text-align: left;\n",
2015        "    }\n",
2016        "\n",
2017        "    .dataframe tbody tr th {\n",
2018        "        vertical-align: top;\n",
2019        "    }\n",
2020        "</style>\n",
2021        "<table border=\"1\" class=\"dataframe\">\n",
2022        "  <thead>\n",
2023        "    <tr style=\"text-align: right;\">\n",
2024        "      <th></th>\n",
2025        "      <th>eid</th>\n",
2026        "      <th>cluster</th>\n",
2027        "    </tr>\n",
2028        "  </thead>\n",
2029        "  <tbody>\n",
2030        "    <tr>\n",
2031        "      <th>0</th>\n",
2032        "      <td>2-s2.0-71149088987</td>\n",
2033        "      <td>1</td>\n",
2034        "    </tr>\n",
2035        "    <tr>\n",
2036        "      <th>1</th>\n",
2037        "      <td>2-s2.0-70349816888</td>\n",
2038        "      <td>1</td>\n",
2039        "    </tr>\n",
2040        "    <tr>\n",
2041        "      <th>2</th>\n",
2042        "      <td>2-s2.0-79953711711</td>\n",
2043        "      <td>1</td>\n",
2044        "    </tr>\n",
2045        "    <tr>\n",
2046        "      <th>3</th>\n",
2047        "      <td>2-s2.0-79551630751</td>\n",
2048        "      <td>1</td>\n",
2049        "    </tr>\n",
2050        "    <tr>\n",
2051        "      <th>4</th>\n",
2052        "      <td>2-s2.0-80051469103</td>\n",
2053        "      <td>1</td>\n",
2054        "    </tr>\n",
2055        "    <tr>\n",
2056        "      <th>5</th>\n",
2057        "      <td>2-s2.0-84866718851</td>\n",
2058        "      <td>1</td>\n",
2059        "    </tr>\n",
2060        "    <tr>\n",
2061        "      <th>6</th>\n",
2062        "      <td>2-s2.0-84877685551</td>\n",
2063        "      <td>1</td>\n",
2064        "    </tr>\n",
2065        "    <tr>\n",
2066        "      <th>7</th>\n",
2067        "      <td>2-s2.0-84864442547</td>\n",
2068        "      <td>1</td>\n",
2069        "    </tr>\n",
2070        "    <tr>\n",
2071        "      <th>8</th>\n",
2072        "      <td>2-s2.0-84861420864</td>\n",
2073        "      <td>1</td>\n",
2074        "    </tr>\n",
2075        "    <tr>\n",
2076        "      <th>9</th>\n",
2077        "      <td>2-s2.0-84887483487</td>\n",
2078        "      <td>1</td>\n",
2079        "    </tr>\n",
2080        "    <tr>\n",
2081        "      <th>10</th>\n",
2082        "      <td>2-s2.0-80955144847</td>\n",
2083        "      <td>1</td>\n",
2084        "    </tr>\n",
2085        "    <tr>\n",
2086        "      <th>11</th>\n",
2087        "      <td>2-s2.0-84885038309</td>\n",
2088        "      <td>1</td>\n",
2089        "    </tr>\n",
2090        "    <tr>\n",
2091        "      <th>12</th>\n",
2092        "      <td>2-s2.0-84886099569</td>\n",
2093        "      <td>1</td>\n",
2094        "    </tr>\n",
2095        "    <tr>\n",
2096        "      <th>13</th>\n",
2097        "      <td>2-s2.0-84863379783</td>\n",
2098        "      <td>1</td>\n",
2099        "    </tr>\n",
2100        "    <tr>\n",
2101        "      <th>14</th>\n",
2102        "      <td>2-s2.0-84899093663</td>\n",
2103        "      <td>1</td>\n",
2104        "    </tr>\n",
2105        "    <tr>\n",
2106        "      <th>15</th>\n",
2107        "      <td>2-s2.0-84879109859</td>\n",
2108        "      <td>1</td>\n",
2109        "    </tr>\n",
2110        "    <tr>\n",
2111        "      <th>16</th>\n",
2112        "      <td>2-s2.0-83055168309</td>\n",
2113        "      <td>1</td>\n",
2114        "    </tr>\n",
2115        "    <tr>\n",
2116        "      <th>17</th>\n",
2117        "      <td>2-s2.0-84876304322</td>\n",
2118        "      <td>1</td>\n",
2119        "    </tr>\n",
2120        "    <tr>\n",
2121        "      <th>18</th>\n",
2122        "      <td>2-s2.0-84866168147</td>\n",
2123        "      <td>1</td>\n",
2124        "    </tr>\n",
2125        "    <tr>\n",
2126        "      <th>19</th>\n",
2127        "      <td>2-s2.0-84877817428</td>\n",
2128        "      <td>1</td>\n",
2129        "    </tr>\n",
2130        "    <tr>\n",
2131        "      <th>20</th>\n",
2132        "      <td>2-s2.0-84873481256</td>\n",
2133        "      <td>1</td>\n",
2134        "    </tr>\n",
2135        "    <tr>\n",
2136        "      <th>21</th>\n",
2137        "      <td>2-s2.0-84861794897</td>\n",
2138        "      <td>1</td>\n",
2139        "    </tr>\n",
2140        "    <tr>\n",
2141        "      <th>22</th>\n",
2142        "      <td>2-s2.0-84899508298</td>\n",
2143        "      <td>1</td>\n",
2144        "    </tr>\n",
2145        "    <tr>\n",
2146        "      <th>23</th>\n",
2147        "      <td>2-s2.0-84898082465</td>\n",
2148        "      <td>1</td>\n",
2149        "    </tr>\n",
2150        "    <tr>\n",
2151        "      <th>24</th>\n",
2152        "      <td>2-s2.0-84879021774</td>\n",
2153        "      <td>1</td>\n",
2154        "    </tr>\n",
2155        "    <tr>\n",
2156        "      <th>25</th>\n",
2157        "      <td>2-s2.0-80054988041</td>\n",
2158        "      <td>1</td>\n",
2159        "    </tr>\n",
2160        "    <tr>\n",
2161        "      <th>26</th>\n",
2162        "      <td>2-s2.0-84944394118</td>\n",
2163        "      <td>1</td>\n",
2164        "    </tr>\n",
2165        "    <tr>\n",
2166        "      <th>27</th>\n",
2167        "      <td>2-s2.0-84870572301</td>\n",
2168        "      <td>1</td>\n",
2169        "    </tr>\n",
2170        "    <tr>\n",
2171        "      <th>28</th>\n",
2172        "      <td>2-s2.0-84907167320</td>\n",
2173        "      <td>1</td>\n",
2174        "    </tr>\n",
2175        "    <tr>\n",
2176        "      <th>29</th>\n",
2177        "      <td>2-s2.0-84914675721</td>\n",
2178        "      <td>1</td>\n",
2179        "    </tr>\n",
2180        "    <tr>\n",
2181        "      <th>...</th>\n",
2182        "      <td>...</td>\n",
2183        "      <td>...</td>\n",
2184        "    </tr>\n",
2185        "    <tr>\n",
2186        "      <th>6110</th>\n",
2187        "      <td>2-s2.0-84856086839</td>\n",
2188        "      <td>12</td>\n",
2189        "    </tr>\n",
2190        "    <tr>\n",
2191        "      <th>6111</th>\n",
2192        "      <td>2-s2.0-84859510122</td>\n",
2193        "      <td>12</td>\n",
2194        "    </tr>\n",
2195        "    <tr>\n",
2196        "      <th>6112</th>\n",
2197        "      <td>2-s2.0-84905121209</td>\n",
2198        "      <td>12</td>\n",
2199        "    </tr>\n",
2200        "    <tr>\n",
2201        "      <th>6113</th>\n",
2202        "      <td>2-s2.0-84883758613</td>\n",
2203        "      <td>12</td>\n",
2204        "    </tr>\n",
2205        "    <tr>\n",
2206        "      <th>6114</th>\n",
2207        "      <td>2-s2.0-84877953100</td>\n",
2208        "      <td>12</td>\n",
2209        "    </tr>\n",
2210        "    <tr>\n",
2211        "      <th>6115</th>\n",
2212        "      <td>2-s2.0-84904376766</td>\n",
2213        "      <td>12</td>\n",
2214        "    </tr>\n",
2215        "    <tr>\n",
2216        "      <th>6116</th>\n",
2217        "      <td>2-s2.0-84905837182</td>\n",
2218        "      <td>12</td>\n",
2219        "    </tr>\n",
2220        "    <tr>\n",
2221        "      <th>6117</th>\n",
2222        "      <td>2-s2.0-84900461218</td>\n",
2223        "      <td>12</td>\n",
2224        "    </tr>\n",
2225        "    <tr>\n",
2226        "      <th>6118</th>\n",
2227        "      <td>2-s2.0-83755228785</td>\n",
2228        "      <td>13</td>\n",
2229        "    </tr>\n",
2230        "    <tr>\n",
2231        "      <th>6119</th>\n",
2232        "      <td>2-s2.0-84886795975</td>\n",
2233        "      <td>13</td>\n",
2234        "    </tr>\n",
2235        "    <tr>\n",
2236        "      <th>6120</th>\n",
2237        "      <td>2-s2.0-84876132785</td>\n",
2238        "      <td>13</td>\n",
2239        "    </tr>\n",
2240        "    <tr>\n",
2241        "      <th>6121</th>\n",
2242        "      <td>2-s2.0-84903121334</td>\n",
2243        "      <td>13</td>\n",
2244        "    </tr>\n",
2245        "    <tr>\n",
2246        "      <th>6122</th>\n",
2247        "      <td>2-s2.0-84863720400</td>\n",
2248        "      <td>13</td>\n",
2249        "    </tr>\n",
2250        "    <tr>\n",
2251        "      <th>6123</th>\n",
2252        "      <td>2-s2.0-84873180938</td>\n",
2253        "      <td>13</td>\n",
2254        "    </tr>\n",
2255        "    <tr>\n",
2256        "      <th>6124</th>\n",
2257        "      <td>2-s2.0-84914112838</td>\n",
2258        "      <td>13</td>\n",
2259        "    </tr>\n",
2260        "    <tr>\n",
2261        "      <th>6125</th>\n",
2262        "      <td>2-s2.0-84878795748</td>\n",
2263        "      <td>13</td>\n",
2264        "    </tr>\n",
2265        "    <tr>\n",
2266        "      <th>6126</th>\n",
2267        "      <td>2-s2.0-84888011666</td>\n",
2268        "      <td>13</td>\n",
2269        "    </tr>\n",
2270        "    <tr>\n",
2271        "      <th>6127</th>\n",
2272        "      <td>2-s2.0-84942101218</td>\n",
2273        "      <td>13</td>\n",
2274        "    </tr>\n",
2275        "    <tr>\n",
2276        "      <th>6128</th>\n",
2277        "      <td>2-s2.0-80052752113</td>\n",
2278        "      <td>14</td>\n",
2279        "    </tr>\n",
2280        "    <tr>\n",
2281        "      <th>6129</th>\n",
2282        "      <td>2-s2.0-84874074707</td>\n",
2283        "      <td>14</td>\n",
2284        "    </tr>\n",
2285        "    <tr>\n",
2286        "      <th>6130</th>\n",
2287        "      <td>2-s2.0-84942582235</td>\n",
2288        "      <td>14</td>\n",
2289        "    </tr>\n",
2290        "    <tr>\n",
2291        "      <th>6131</th>\n",
2292        "      <td>2-s2.0-70849130360</td>\n",
2293        "      <td>14</td>\n",
2294        "    </tr>\n",
2295        "    <tr>\n",
2296        "      <th>6132</th>\n",
2297        "      <td>2-s2.0-84864152630</td>\n",
2298        "      <td>14</td>\n",
2299        "    </tr>\n",
2300        "    <tr>\n",
2301        "      <th>6133</th>\n",
2302        "      <td>2-s2.0-84868709161</td>\n",
2303        "      <td>15</td>\n",
2304        "    </tr>\n",
2305        "    <tr>\n",
2306        "      <th>6134</th>\n",
2307        "      <td>2-s2.0-84896350015</td>\n",
2308        "      <td>15</td>\n",
2309        "    </tr>\n",
2310        "    <tr>\n",
2311        "      <th>6135</th>\n",
2312        "      <td>2-s2.0-84944104933</td>\n",
2313        "      <td>15</td>\n",
2314        "    </tr>\n",
2315        "    <tr>\n",
2316        "      <th>6136</th>\n",
2317        "      <td>2-s2.0-84875539506</td>\n",
2318        "      <td>16</td>\n",
2319        "    </tr>\n",
2320        "    <tr>\n",
2321        "      <th>6137</th>\n",
2322        "      <td>2-s2.0-84902262954</td>\n",
2323        "      <td>16</td>\n",
2324        "    </tr>\n",
2325        "    <tr>\n",
2326        "      <th>6138</th>\n",
2327        "      <td>2-s2.0-84909954481</td>\n",
2328        "      <td>17</td>\n",
2329        "    </tr>\n",
2330        "    <tr>\n",
2331        "      <th>6139</th>\n",
2332        "      <td>2-s2.0-84921469678</td>\n",
2333        "      <td>18</td>\n",
2334        "    </tr>\n",
2335        "  </tbody>\n",
2336        "</table>\n",
2337        "<p>6140 rows × 2 columns</p>\n",
2338        "</div>"
2339       ],
2340       "text/plain": [
2341        "                     eid  cluster\n",
2342        "0     2-s2.0-71149088987        1\n",
2343        "1     2-s2.0-70349816888        1\n",
2344        "2     2-s2.0-79953711711        1\n",
2345        "3     2-s2.0-79551630751        1\n",
2346        "4     2-s2.0-80051469103        1\n",
2347        "5     2-s2.0-84866718851        1\n",
2348        "6     2-s2.0-84877685551        1\n",
2349        "7     2-s2.0-84864442547        1\n",
2350        "8     2-s2.0-84861420864        1\n",
2351        "9     2-s2.0-84887483487        1\n",
2352        "10    2-s2.0-80955144847        1\n",
2353        "11    2-s2.0-84885038309        1\n",
2354        "12    2-s2.0-84886099569        1\n",
2355        "13    2-s2.0-84863379783        1\n",
2356        "14    2-s2.0-84899093663        1\n",
2357        "15    2-s2.0-84879109859        1\n",
2358        "16    2-s2.0-83055168309        1\n",
2359        "17    2-s2.0-84876304322        1\n",
2360        "18    2-s2.0-84866168147        1\n",
2361        "19    2-s2.0-84877817428        1\n",
2362        "20    2-s2.0-84873481256        1\n",
2363        "21    2-s2.0-84861794897        1\n",
2364        "22    2-s2.0-84899508298        1\n",
2365        "23    2-s2.0-84898082465        1\n",
2366        "24    2-s2.0-84879021774        1\n",
2367        "25    2-s2.0-80054988041        1\n",
2368        "26    2-s2.0-84944394118        1\n",
2369        "27    2-s2.0-84870572301        1\n",
2370        "28    2-s2.0-84907167320        1\n",
2371        "29    2-s2.0-84914675721        1\n",
2372        "...                  ...      ...\n",
2373        "6110  2-s2.0-84856086839       12\n",
2374        "6111  2-s2.0-84859510122       12\n",
2375        "6112  2-s2.0-84905121209       12\n",
2376        "6113  2-s2.0-84883758613       12\n",
2377        "6114  2-s2.0-84877953100       12\n",
2378        "6115  2-s2.0-84904376766       12\n",
2379        "6116  2-s2.0-84905837182       12\n",
2380        "6117  2-s2.0-84900461218       12\n",
2381        "6118  2-s2.0-83755228785       13\n",
2382        "6119  2-s2.0-84886795975       13\n",
2383        "6120  2-s2.0-84876132785       13\n",
2384        "6121  2-s2.0-84903121334       13\n",
2385        "6122  2-s2.0-84863720400       13\n",
2386        "6123  2-s2.0-84873180938       13\n",
2387        "6124  2-s2.0-84914112838       13\n",
2388        "6125  2-s2.0-84878795748       13\n",
2389        "6126  2-s2.0-84888011666       13\n",
2390        "6127  2-s2.0-84942101218       13\n",
2391        "6128  2-s2.0-80052752113       14\n",
2392        "6129  2-s2.0-84874074707       14\n",
2393        "6130  2-s2.0-84942582235       14\n",
2394        "6131  2-s2.0-70849130360       14\n",
2395        "6132  2-s2.0-84864152630       14\n",
2396        "6133  2-s2.0-84868709161       15\n",
2397        "6134  2-s2.0-84896350015       15\n",
2398        "6135  2-s2.0-84944104933       15\n",
2399        "6136  2-s2.0-84875539506       16\n",
2400        "6137  2-s2.0-84902262954       16\n",
2401        "6138  2-s2.0-84909954481       17\n",
2402        "6139  2-s2.0-84921469678       18\n",
2403        "\n",
2404        "[6140 rows x 2 columns]"
2405       ]
2406      },
2407      "metadata": {},
2408      "output_type": "display_data"
2409     }
2410    ],
2411    "source": [
2412     "remember('g_sm_clusters', g_sm_clu[[\"eid\", \"cluster\"]])"
2413    ]
2414   },
2415   {
2416    "cell_type": "code",
2417    "execution_count": 88,
2418    "metadata": {},
2419    "outputs": [
2420     {
2421      "data": {
2422       "text/plain": [
2423        "['all_citations',\n",
2424        " 'all_cited',\n",
2425        " 'all_citing',\n",
2426        " 'cluster_edgelist',\n",
2427        " 'g_sm_clusters',\n",
2428        " 'sm_citations',\n",
2429        " 'sm_cited',\n",
2430        " 'sm_citing',\n",
2431        " 'total_articles']"
2432       ]
2433      },
2434      "execution_count": 88,
2435      "metadata": {},
2436      "output_type": "execute_result"
2437     }
2438    ],
2439    "source": [
2440     "sorted(r.keys())"
2441    ]
2442   },
2443   {
2444    "cell_type": "code",
2445    "execution_count": 89,
2446    "metadata": {},
2447    "outputs": [],
2448    "source": [
2449     "# save everything stored in the r dictionary to an RData file\n",
2450     "def save_to_r(r_dict, filename=\"output.RData\"):\n",
2451     "    for var_name, x in r.items():\n",
2452     "        var_name = var_name.replace('_', '.')\n",
2453     "        if type(x) == np.int64:\n",
2454     "            x = np.asscalar(x)\n",
2455     "        \n",
2456     "        if type(x) == pd.DataFrame:\n",
2457     "            rx = pandas2ri.py2ri(x)\n",
2458     "        else:\n",
2459     "            rx = x\n",
2460     "        \n",
2461     "        robjects.r.assign(var_name, x)\n",
2462     "\n",
2463     "        # create a new variable called in R\n",
2464     "    robjects.r(\"r <- sapply(ls(), function (x) {eval(parse(text=x))})\")\n",
2465     "    robjects.r('save(\"r\", file=\"{}\")'.format(filename))\n",
2466     "    robjects.r(\"rm(list=ls())\")\n",
2467     "    \n",
2468     "save_to_r(r, \"../../paper/data/network_data.RData\")"
2469    ]
2470   }
2471  ],
2472  "metadata": {
2473   "kernelspec": {
2474    "display_name": "Python 3",
2475    "language": "python",
2476    "name": "python3"
2477   },
2478   "language_info": {
2479    "codemirror_mode": {
2480     "name": "ipython",
2481     "version": 3
2482    },
2483    "file_extension": ".py",
2484    "mimetype": "text/x-python",
2485    "name": "python",
2486    "nbconvert_exporter": "python",
2487    "pygments_lexer": "ipython3",
2488    "version": "3.6.4"
2489   }
2490  },
2491  "nbformat": 4,
2492  "nbformat_minor": 1
2493 }

Community Data Science Collective || Want to submit a patch?