2 @article{howison_flossmole_2006,
5 issn = {1554-1045, 1554-1053},
6 url = {http://www.igi-global.com/article/international-journal-information-technology-web/2610},
7 doi = {10.4018/jitwe.2006070102},
10 journaltitle = {International Journal of Information Technology and Web Engineering},
11 author = {Howison, James and Conklin, Megan and Crowston, Kevin},
12 urldate = {2013-06-15},
16 @article{bohannon_google_2011,
17 title = {Google Books, Wikipedia, and the Future of Culturomics},
19 issn = {0036-8075, 1095-9203},
20 url = {http://www.sciencemag.org/content/331/6014/135},
21 doi = {10.1126/science.331.6014.135},
22 abstract = {As a follow-up to the quantitative analysis of data obtained from Google Books published online in Science on 16 December 2010 and in this week's issue on page 176, one of the study's authors has been using Wikipedia to analyze the fame of scientists whose names appear in books over the centuries. But his effort has been hampered by the online encyclopedia's shortcomings, from the reliability of its information to the organization of its content. Several efforts are under way to improve Wikipedia as a teaching and research tool, including one by the Association for Psychological Science that seeks to create a more complete and accurate representation of its field.},
25 journaltitle = {Science},
26 author = {Bohannon, John},
27 urldate = {2014-02-14},
33 @article{welles_visualizing_2015,
34 title = {Visualizing Computational Social Science: The Multiple Lives of a Complex Image},
36 url = {http://scx.sagepub.com/content/37/1/34.short},
39 journaltitle = {Science Communication},
40 author = {Welles, Brooke Foucault and Meirelles, Isabel},
41 urldate = {2015-08-05},
43 file = {[PDF] from sagepub.com:/home/jeremy/Zotero/storage/AMRMRGNB/Welles and Meirelles - 2015 - Visualizing Computational Social Science The Multi.pdf:application/pdf}
46 @article{van_noorden_interdisciplinary_2015,
47 title = {Interdisciplinary research by the numbers},
49 issn = {0028-0836, 1476-4687},
50 url = {http://www.nature.com/doifinder/10.1038/525306a},
51 doi = {10.1038/525306a},
54 journaltitle = {Nature},
55 author = {Van Noorden, Richard},
56 urldate = {2015-09-21},
60 @article{mcfarland_sociology_2015,
61 title = {Sociology in the Era of Big Data: The Ascent of Forensic Social Science},
62 issn = {0003-1232, 1936-4784},
63 url = {http://link.springer.com/article/10.1007/s12108-015-9291-8},
64 doi = {10.1007/s12108-015-9291-8},
65 shorttitle = {Sociology in the Era of Big Data},
67 journaltitle = {The American Sociologist},
68 shortjournal = {Am Soc},
69 author = {{McFarland}, Daniel A. and Lewis, Kevin and Goldberg, Amir},
70 urldate = {2015-09-25},
73 keywords = {Forensic social science, Social Sciences, general, Sociology of science, Sociology, general, Computational social science, Big data},
74 file = {Full Text PDF:/home/jeremy/Zotero/storage/F66XW8K7/McFarland et al. - 2015 - Sociology in the Era of Big Data The Ascent of Fo.pdf:application/pdf}
77 @article{hargittai_is_2015,
78 title = {Is Bigger Always Better? Potential Biases of Big Data Derived from Social Network Sites},
80 issn = {0002-7162, 1552-3349},
81 url = {http://ann.sagepub.com/content/659/1/63},
82 doi = {10.1177/0002716215570866},
83 shorttitle = {Is Bigger Always Better?},
84 abstract = {This article discusses methodological challenges of using big data that rely on specific sites and services as their sampling frames, focusing on social network sites in particular. It draws on survey data to show that people do not select into the use of such sites randomly. Instead, use is biased in certain ways yielding samples that limit the generalizability of findings. Results show that age, gender, race/ethnicity, socioeconomic status, online experiences, and Internet skills all influence the social network sites people use and thus where traces of their behavior show up. This has implications for the types of conclusions one can draw from data derived from users of specific sites. The article ends by noting how big data studies can address the shortcomings that result from biased sampling frames.},
87 journaltitle = {The {ANNALS} of the American Academy of Political and Social Science},
88 shortjournal = {The {ANNALS} of the American Academy of Political and Social Science},
89 author = {Hargittai, Eszter},
90 urldate = {2015-10-19},
93 keywords = {digital inequality, social network sites, sampling, Internet skills, sampling frame, biased sample, Big data}
96 @article{lazer_computational_2009,
97 title = {Computational Social Science},
99 url = {http://www.sciencemag.org},
100 doi = {10.1126/science.1167742},
101 shorttitle = {Computational Social Science},
104 journaltitle = {Science},
105 author = {Lazer, David and Pentland, Alex and Adamic, Lada and Aral, Sinan and Barabasi, Albert-Laszlo and Brewer, Devon and Christakis, Nicholas and Contractor, Noshir and Fowler, James and Gutmann, Myron and Jebara, Tony and King, Gary and Macy, Michael and Roy, Deb and Van Alstyne, Marshall},
106 urldate = {2009-03-06},
108 file = {HighWire Snapshot:/home/jeremy/Zotero/storage/C939DFAS/721.html:text/html;PubMed Central Full Text PDF:/home/jeremy/Zotero/storage/RPX8A4ID/Lazer et al. - 2009 - Life in the network the coming age of computation.pdf:application/pdf}
111 @article{mann_bibliometric_2006,
112 title = {Bibliometric impact measures leveraging topic analysis},
113 abstract = {Measurements of the impact and history of research literature provide a useful complement to scientific digital library collections. Bibliometric indicators have been extensively studied, mostly in the context of journals. However, journal-based metrics poorly capture topical distinctions in fast-moving fields, and are increasingly problematic with the rise of open-access publishing. Recent developments in latent topic models have produced promising results for automatic sub-field discovery. The fine-grained, faceted topics produced by such models provide a clearer view of the topical divisions of a body of research literature and the interactions between those divisions. We demonstrate the usefulness of topic models in measuring impact by applying a new phrase-based topic discovery model to a collection of 300,000 computer science publications, collected by the Rexa automatic citation indexing system},
115 author = {Mann, Gideon S. and Mimno, David and McCallum, Andrew}, eventtitle = {2006 IEEE/ACM 6th Joint Conference on Digital Libraries},
118 file = {Mann et al. - 2006 - Bibliometric impact measures leveraging topic anal.pdf:/home/jeremy/Zotero/storage/RHR8REID/Mann et al. - 2006 - Bibliometric impact measures leveraging topic anal.pdf:application/pdf}
121 @article{reid_mapping_2007,
122 title = {Mapping the contemporary terrorism research domain},
125 abstract = {A systematic view of terrorism research to reveal the intellectual structure of the field and empirically discern the distinct set of core researchers, institutional affiliations, publications, and conceptual areas can help us gain a deeper understanding of approaches to terrorism. This paper responds to this need by using an integrated knowledge-mapping framework that we developed to identify the core researchers and knowledge creation approaches in terrorism. The framework uses three types of analysis: (a) basic analysis of scientific output using citation, bibliometric, and social network analyses, (b) content map analysis of large corpora of literature, and (c) co-citation analysis to analyse linkages among pairs of researchers. We applied domain visualization techniques such as content map analysis, block-modeling, and co-citation analysis to the literature and author citation data from the years 1965 to 2003. The data were gathered from ten databases such as the {ISI} Web of Science. The results reveal: (1) the names of the top 42 core terrorism researchers (e.g., Brian Jenkins, Bruce Hoffman, and Paul Wilkinson) as well as their institutional affiliations; (2) their influential publications; (3) clusters of terrorism researchers who work in similar areas; and (4) that the research focus has shifted from terrorism as a low-intensity conflict to a strategic threat to world powers with increased focus on Osama Bin Laden.},
128 journaltitle = {International Journal of Human-Computer Studies},
129 author = {Reid, Edna F and Chen, Hsinchun},
132 file = {Reid and Chen - 2007 - Mapping the contemporary terrorism research domain.pdf:/home/jeremy/Zotero/storage/DAN5ATFN/Reid and Chen - 2007 - Mapping the contemporary terrorism research domain.pdf:application/pdf}
135 @article{blei_probabilistic_2012,
136 title = {Probabilistic Topic Models},
139 url = {http://doi.acm.org/10.1145/2133806.2133826},
140 doi = {10.1145/2133806.2133826},
141 abstract = {Surveying a suite of algorithms that offer a solution to managing large document archives.},
144 journaltitle = {Communications of the {ACM}},
145 author = {Blei, David M.},
146 urldate = {2016-03-07},
148 file = {Blei - 2012 - Probabilistic Topic Models.pdf:/home/jeremy/Zotero/storage/5HZENWNZ/Blei - 2012 - Probabilistic Topic Models.pdf:application/pdf}
151 @article{schwartz_personality_2013,
152 title = {Personality, Gender, and Age in the Language of Social Media: The Open-Vocabulary Approach},
155 url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0073791},
156 doi = {10.1371/journal.pone.0073791},
157 shorttitle = {Personality, Gender, and Age in the Language of Social Media},
158 abstract = {We analyzed 700 million words, phrases, and topic instances collected from the Facebook messages of 75,000 volunteers, who also took standard personality tests, and found striking variations in language with personality, gender, and age. In our open-vocabulary technique, the data itself drives a comprehensive exploration of language that distinguishes people, finding connections that are not captured with traditional closed-vocabulary word-category analyses. Our analyses shed new light on psychosocial processes yielding results that are face valid (e.g., subjects living in high elevations talk about the mountains), tie in with other research (e.g., neurotic people disproportionately use the phrase ‘sick of’ and the word ‘depressed’), suggest new hypotheses (e.g., an active life implies emotional stability), and give detailed insights (males use the possessive ‘my’ when mentioning their ‘wife’ or ‘girlfriend’ more often than females use ‘my’ with ‘husband’ or ‘boyfriend’). To date, this represents the largest study, by an order of magnitude, of language and personality.},
161 journaltitle = {{PLOS} {ONE}},
162 shortjournal = {{PLOS} {ONE}},
163 author = {Schwartz, H. Andrew and Eichstaedt, Johannes C. and Kern, Margaret L. and Dziurzynski, Lukasz and Ramones, Stephanie M. and Agrawal, Megha and Shah, Achal and Kosinski, Michal and Stillwell, David and Seligman, Martin E. P. and Ungar, Lyle H.},
164 urldate = {2016-03-07},
166 keywords = {Social Media, Facebook, Personality, Psychology, language, Psycholinguistics, Forecasting, Vocabulary},
167 file = {Schwartz et al. - 2013 - Personality, Gender, and Age in the Language of So.pdf:/home/jeremy/Zotero/storage/CKR7EZ5S/Schwartz et al. - 2013 - Personality, Gender, and Age in the Language of So.pdf:application/pdf}
170 @article{kovacs_exploring_2015,
171 title = {Exploring the scope of open innovation: a bibliometric review of a decade of research},
173 issn = {0138-9130, 1588-2861},
174 url = {http://link.springer.com/article/10.1007/s11192-015-1628-0},
175 doi = {10.1007/s11192-015-1628-0},
176 shorttitle = {Exploring the scope of open innovation},
177 abstract = {The concept of open innovation has attracted considerable attention since Henry Chesbrough first coined it to capture the increasing reliance of firms on external sources of innovation. Although open innovation has flourished as a topic within innovation management research, it has also triggered debates about the coherence of the research endeavors pursued under this umbrella, including its theoretical foundations. In this paper, we aim to contribute to these debates through a bibliometric review of the first decade of open innovation research. We combine two techniques—bibliographic coupling and co-citation analysis—to (1) visualize the network of publications that explicitly use the label ‘open innovation’ and (2) to arrive at distinct clusters of thematically related publications. Our findings illustrate that open innovation research builds principally on four related streams of prior research, whilst the bibliographic network of open innovation research reveals that seven thematic clusters have been pursued persistently. While such persistence is undoubtedly useful to arrive at in-depth and robust insights, the observed patterns also signal the absence of new, emerging, themes. As such, ‘open innovation’ might benefit from applying its own ideas: sourcing concepts and models from a broader range of theoretical perspectives as well as pursuing a broader range of topics might introduce dynamics resulting in more impact and proliferation.},
180 journaltitle = {Scientometrics},
181 shortjournal = {Scientometrics},
182 author = {Kovács, Adrián and Van Looy, Bart and Cassiman, Bruno},
183 urldate = {2016-04-20},
186 keywords = {open innovation, Library Science, Information Storage and Retrieval, 91-02, Co-citation analysis, Bibliographic coupling, O32, Q55, Interdisciplinary Studies, openness},
187 file = {Kovács et al. - 2015 - Exploring the scope of open innovation a bibliome.pdf:/home/jeremy/Zotero/storage/MFDEMAFC/Kovács et al. - 2015 - Exploring the scope of open innovation a bibliome.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/AITBH9EK/s11192-015-1628-0.html:text/html}
190 @inproceedings{blei_dynamic_2006,
191 title = {Dynamic topic models},
192 url = {http://dl.acm.org/citation.cfm?id=1143859},
194 booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
196 author = {Blei, David M. and Lafferty, John D.},
197 urldate = {2016-04-21},
199 file = {[PDF] from cmu.edu:/home/jeremy/Zotero/storage/UBSD9KNT/Blei and Lafferty - 2006 - Dynamic topic models.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/MR3H4FSU/citation.html:text/html}
202 @inproceedings{hall_studying_2008,
203 location = {Stroudsburg, {PA}, {USA}},
204 title = {Studying the History of Ideas Using Topic Models},
205 url = {http://dl.acm.org/citation.cfm?id=1613715.1613763},
206 series = {{EMNLP} '08},
207 abstract = {How can the development of ideas in a scientific field be studied over time? We apply unsupervised topic modeling to the {ACL} Anthology to analyze historical trends in the field of Computational Linguistics from 1978 to 2006. We induce topic clusters using Latent Dirichlet Allocation, and examine the strength of each topic over time. Our methods find trends in the field including the rise of probabilistic methods starting in 1988, a steady increase in applications, and a sharp decline of research in semantics and understanding between 1978 and 2001, possibly rising again after 2001. We also introduce a model of the diversity of ideas, topic entropy, using it to show that {COLING} is a more diverse conference than {ACL}, but that both conferences as well as {EMNLP} are becoming broader over time. Finally, we apply Jensen-Shannon divergence of topic distributions to show that all three conferences are converging in the topics they cover.},
209 booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
210 publisher = {Association for Computational Linguistics},
211 author = {Hall, David and Jurafsky, Daniel and Manning, Christopher D.},
212 urldate = {2016-04-21},
214 file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/UZV4H35G/Hall et al. - 2008 - Studying the History of Ideas Using Topic Models.pdf:application/pdf}
217 @inproceedings{mitra_language_2014,
218 location = {New York, {NY}, {USA}},
219 title = {The Language That Gets People to Give: Phrases That Predict Success on Kickstarter},
220 isbn = {978-1-4503-2540-0},
221 url = {http://doi.acm.org/10.1145/2531602.2531656},
222 doi = {10.1145/2531602.2531656},
223 series = {{CSCW} '14},
224 shorttitle = {The Language That Gets People to Give},
225 abstract = {Crowdfunding sites like Kickstarter--where entrepreneurs and artists look to the internet for funding--have quickly risen to prominence. However, we know very little about the factors driving the 'crowd' to take projects to their funding goal. In this paper we explore the factors which lead to successfully funding a crowdfunding project. We study a corpus of 45K crowdfunded projects, analyzing 9M phrases and 59 other variables commonly present on crowdfunding sites. The language used in the project has surprising predictive power accounting for 58.56\% of the variance around successful funding. A closer look at the phrases shows they exhibit general persuasion principles. For example, also receive two reflects the principle of Reciprocity and is one of the top predictors of successful funding. We conclude this paper by announcing the release of the predictive phrases along with the control variables as a public dataset, hoping that our work can enable new features on crowdfunding sites--tools to help both backers and project creators make the best use of their time and money.},
227 booktitle = {Proceedings of the 17th {ACM} Conference on Computer Supported Cooperative Work \& Social Computing},
229 author = {Mitra, Tanushree and Gilbert, Eric},
230 urldate = {2016-04-29},
232 keywords = {crowdfunding, natural language processing (nlp), {CMC}}
235 @book{wasserman_social_1994,
236 title = {Social Network Analysis: Methods and Applications},
237 publisher = {Cambridge University Press},
238 author = {Wasserman, Stanley and Faust, Katherine},
242 @article{tausczik_psychological_2010,
243 title = {The Psychological Meaning of Words: {LIWC} and Computerized Text Analysis Methods},
245 issn = {0261-927X, 1552-6526},
246 url = {http://jls.sagepub.com/content/29/1/24},
247 doi = {10.1177/0261927X09351676},
248 shorttitle = {The Psychological Meaning of Words},
249 abstract = {We are in the midst of a technological revolution whereby, for the first time, researchers can link daily word use to a broad array of real-world behaviors. This article reviews several computerized text analysis methods and describes how Linguistic Inquiry and Word Count ({LIWC}) was created and validated. {LIWC} is a transparent text analysis program that counts words in psychologically meaningful categories. Empirical results using {LIWC} demonstrate its ability to detect meaning in a wide variety of experimental settings, including to show attentional focus, emotionality, social relationships, thinking styles, and individual differences.},
252 journaltitle = {Journal of Language and Social Psychology},
253 shortjournal = {Journal of Language and Social Psychology},
254 author = {Tausczik, Yla R. and Pennebaker, James W.},
255 urldate = {2016-07-12},
258 keywords = {attention, {LIWC}, deception, dominance, relationships, pronouns, computerized text analysis},
259 file = {Full Text PDF:/home/jeremy/Zotero/storage/G6TIZD38/Tausczik and Pennebaker - 2010 - The Psychological Meaning of Words LIWC and Compu.pdf:application/pdf}
262 @book{smith_general_2014,
263 title = {General social surveys, 1972-2014},
264 shorttitle = {General social surveys, 1972-2014},
265 publisher = {National Opinion Research Center ({NORC})},
266 author = {Smith, Tom William and Marsden, Peter and Hout, Michael and Kim, Jibum},
270 @book{leskovec_snap_2014,
271 title = {{SNAP} Datasets: Stanford Large Network Dataset Collection},
272 url = {http://snap.stanford.edu/data},
273 author = {Leskovec, Jure and Krevl, Andrej},
277 @article{kozinets_field_2002,
278 title = {The Field Behind the Screen: Using Netnography for Marketing Research in Online Communities},
281 url = {http://journals.ama.org/doi/abs/10.1509/jmkr.39.1.61.18935},
282 doi = {10.1509/jmkr.39.1.61.18935},
283 shorttitle = {The Field Behind the Screen},
284 abstract = {The author develops “netnography” as an online marketing research technique for providing consumer insight. Netnography is ethnography adapted to the study of online communities. As a method, netnography is faster, simpler, and less expensive than traditional ethnography and more naturalistic and unobtrusive than focus groups or interviews. It provides information on the symbolism, meanings, and consumption patterns of online consumer groups. The author provides guidelines that acknowledge the online environment, respect the inherent flexibility and openness of ethnography, and provide rigor and ethics in the conduct of marketing research. As an illustrative example, the author provides a netnography of an online coffee newsgroup and discusses its marketing implications.},
287 journaltitle = {Journal of Marketing Research},
288 shortjournal = {Journal of Marketing Research},
289 author = {Kozinets, Robert V.},
290 urldate = {2016-07-18},
294 @article{chew_pandemics_2010,
295 title = {Pandemics in the Age of Twitter: Content Analysis of Tweets during the 2009 H1N1 Outbreak},
298 url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0014118},
299 doi = {10.1371/journal.pone.0014118},
300 shorttitle = {Pandemics in the Age of Twitter},
301 abstract = {Background
302 Surveys are popular methods to measure public perceptions in emergencies but can be costly and time consuming. We suggest and evaluate a complementary “infoveillance” approach using Twitter during the 2009 H1N1 pandemic. Our study aimed to: 1) monitor the use of the terms “H1N1” versus “swine flu” over time; 2) conduct a content analysis of “tweets”; and 3) validate Twitter as a real-time content, sentiment, and public attention trend-tracking tool.
304 Methodology/Principal Findings
305 Between May 1 and December 31, 2009, we archived over 2 million Twitter posts containing keywords “swine flu,” “swineflu,” and/or “H1N1.” using Infovigil, an infoveillance system. Tweets using “H1N1” increased from 8.8\% to 40.5\% ($R^2 = .788$; $p < .001$), indicating a gradual adoption of World Health Organization-recommended terminology. 5,395 tweets were randomly selected from 9 days, 4 weeks apart and coded using a tri-axial coding scheme. To track tweet content and to test the feasibility of automated coding, we created database queries for keywords and correlated these results with manual coding. Content analysis indicated resource-related posts were most commonly shared (52.6\%). 4.5\% of cases were identified as misinformation. News websites were the most popular sources (23.2\%), while government and health agencies were linked only 1.5\% of the time. 7/10 automated queries correlated with manual coding. Several Twitter activity peaks coincided with major news stories. Our results correlated well with H1N1 incidence data.
308 This study illustrates the potential of using social media to conduct “infodemiology” studies for public health. 2009 H1N1-related tweets were primarily used to disseminate information from credible sources, but were also a source of opinions and experiences. Tweets can be used for real-time content analysis and knowledge translation research, allowing health authorities to respond to public concerns.},
311 journaltitle = {{PLOS} {ONE}},
312 shortjournal = {{PLOS} {ONE}},
313 author = {Chew, Cynthia and Eysenbach, Gunther},
314 urldate = {2016-07-18},
316 keywords = {Chi square tests, Public and occupational health, Data Mining, H1N1, Swine influenza, twitter, Swine, Internet},
317 file = {Full Text PDF:/home/jeremy/Zotero/storage/KV2JGXGC/Chew and Eysenbach - 2010 - Pandemics in the Age of Twitter Content Analysis .pdf:application/pdf}
320 @inproceedings{agichtein_finding_2008,
321 location = {New York, {NY}, {USA}},
322 title = {Finding High-quality Content in Social Media},
323 isbn = {978-1-59593-927-2},
324 url = {http://doi.acm.org/10.1145/1341531.1341557},
325 doi = {10.1145/1341531.1341557},
326 series = {{WSDM} '08},
327 abstract = {The quality of user-generated content varies drastically from excellent to abuse and spam. As the availability of such content increases, the task of identifying high-quality content sites based on user contributions --social media sites -- becomes increasingly important. Social media in general exhibit a rich variety of information sources: in addition to the content itself, there is a wide array of non-content information available, such as links between items and explicit quality ratings from members of the community. In this paper we investigate methods for exploiting such community feedback to automatically identify high quality content. As a test case, we focus on Yahoo! Answers, a large community question/answering portal that is particularly rich in the amount and types of content and social interactions available in it. We introduce a general classification framework for combining the evidence from different sources of information, that can be tuned automatically for a given social media type and quality definition. In particular, for the community question/answering domain, we show that our system is able to separate high-quality items from the rest with an accuracy close to that of humans},
329 booktitle = {Proceedings of the 2008 International Conference on Web Search and Data Mining},
331 author = {Agichtein, Eugene and Castillo, Carlos and Donato, Debora and Gionis, Aristides and Mishne, Gilad},
332 urldate = {2016-07-19},
334 keywords = {media, user interactions, community question answering},
335 file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/CNFWMINP/Agichtein et al. - 2008 - Finding High-quality Content in Social Media.pdf:application/pdf;ACM Full Text PDF:/home/jeremy/Zotero/storage/9BDZK58M/Agichtein et al. - 2008 - Finding High-quality Content in Social Media.pdf:application/pdf}
338 @inproceedings{resnick_grouplens:_1994,
339 location = {New York, {NY}, {USA}},
340 title = {{GroupLens}: An Open Architecture for Collaborative Filtering of Netnews},
341 isbn = {978-0-89791-689-9},
342 url = {http://doi.acm.org/10.1145/192844.192905},
343 doi = {10.1145/192844.192905},
344 series = {{CSCW} '94},
345 shorttitle = {{GroupLens}},
346 abstract = {Collaborative filters help people make choices based on the opinions of other people. {GroupLens} is a system for collaborative filtering of netnews, to help people find articles they will like in the huge stream of available articles. News reader clients display predicted scores and make it easy for users to rate articles after they read them. Rating servers, called Better Bit Bureaus, gather and disseminate the ratings. The rating servers predict scores based on the heuristic that people who agreed in the past will probably agree again. Users can protect their privacy by entering ratings under pseudonyms, without reducing the effectiveness of the score prediction. The entire architecture is open: alternative software for news clients and Better Bit Bureaus can be developed independently and can interoperate with the components we have developed.},
348 booktitle = {Proceedings of the 1994 {ACM} Conference on Computer Supported Cooperative Work},
350 author = {Resnick, Paul and Iacovou, Neophytos and Suchak, Mitesh and Bergstrom, Peter and Riedl, John},
351 urldate = {2016-07-19},
353 keywords = {collaborative filtering, selective dissemination of information, user model, social filtering, electronic bulletin boards, netnews, information filtering, Usenet},
354 file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/JPUR4MA4/Resnick et al. - 1994 - GroupLens An Open Architecture for Collaborative .pdf:application/pdf}
357 @inproceedings{wang_tm-lda:_2012,
358 title = {{TM}-{LDA}: efficient online modeling of latent topic transitions in social media},
359 isbn = {978-1-4503-1462-6},
360 url = {http://dl.acm.org/citation.cfm?doid=2339530.2339552},
361 doi = {10.1145/2339530.2339552},
362 shorttitle = {{TM}-{LDA}},
364 publisher = {{ACM} Press},
365 author = {Wang, Yu and Agichtein, Eugene and Benzi, Michele},
366 urldate = {2016-07-19},
371 @inproceedings{prier_identifying_2011,
372 location = {Berlin, Heidelberg},
373 title = {Identifying Health-related Topics on Twitter: An Exploration of Tobacco-related Tweets As a Test Topic},
374 isbn = {978-3-642-19655-3},
375 url = {http://dl.acm.org/citation.cfm?id=1964698.1964702},
377 shorttitle = {Identifying Health-related Topics on Twitter},
378 abstract = {Public health-related topics are difficult to identify in large conversational datasets like Twitter. This study examines how to model and discover public health topics and themes in tweets. Tobacco use is chosen as a test case to demonstrate the effectiveness of topic modeling via {LDA} across a large, representational dataset from the United States, as well as across a smaller subset that was seeded by tobacco-related queries. Topic modeling across the large dataset uncovers several public health-related topics, although tobacco is not detected by this method. However, topic modeling across the tobacco subset provides valuable insight about tobacco use in the United States. The methods used in this paper provide a possible toolset for public health researchers and practitioners to better understand public health problems through large datasets of conversational data.},
380 booktitle = {Proceedings of the 4th International Conference on Social Computing, Behavioral-cultural Modeling and Prediction},
381 publisher = {Springer-Verlag},
382 author = {Prier, Kyle W. and Smith, Matthew S. and Giraud-Carrier, Christophe and Hanson, Carl L.},
383 urldate = {2016-07-19},
385 keywords = {Social Media, tobacco use, {LDA}, Data Mining, topic modeling, Social networks, public health}
388 @inproceedings{pennacchiotti_investigating_2011,
389 location = {New York, {NY}, {USA}},
390 title = {Investigating Topic Models for Social Media User Recommendation},
391 isbn = {978-1-4503-0637-9},
392 url = {http://doi.acm.org/10.1145/1963192.1963244},
393 doi = {10.1145/1963192.1963244},
394 series = {{WWW} '11},
395 abstract = {This paper presents a user recommendation system that recommends to a user new friends having similar interests. We automatically discover users' interests using Latent Dirichlet Allocation ({LDA}), a linguistic topic model that represents users as mixtures of topics. Our system is able to recommend friends for 4 million users with high recall, outperforming existing strategies based on graph analysis.},
397 booktitle = {Proceedings of the 20th International Conference Companion on World Wide Web},
399 author = {Pennacchiotti, Marco and Gurumurthy, Siva},
400 urldate = {2016-07-19},
402 keywords = {Social Media, {LDA}, user recommendation, Topic models},
403 file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/R389CKQJ/Pennacchiotti and Gurumurthy - 2011 - Investigating Topic Models for Social Media User R.pdf:application/pdf}
406 @article{yang_identifying_2014,
407 title = {Identifying Interesting Twitter Contents Using Topical Analysis},
410 url = {http://dx.doi.org/10.1016/j.eswa.2013.12.051},
411 doi = {10.1016/j.eswa.2013.12.051},
412 abstract = {Social media platforms such as Twitter are becoming increasingly mainstream which provides valuable user-generated information by publishing and sharing contents. Identifying interesting and useful contents from large text-streams is a crucial issue in social media because many users struggle with information overload. Retweeting as a forwarding function plays an important role in information propagation where the retweet counts simply reflect a tweet's popularity. However, the main reason for retweets may be limited to personal interests and satisfactions. In this paper, we use a topic identification as a proxy to understand a large number of tweets and to score the interestingness of an individual tweet based on its latent topics. Our assumption is that fascinating topics generate contents that may be of potential interest to a wide audience. We propose a novel topic model called Trend Sensitive-Latent Dirichlet Allocation ({TS}-{LDA}) that can efficiently extract latent topics from contents by modeling temporal trends on Twitter over time. The experimental results on real world data from Twitter demonstrate that our proposed method outperforms several other baseline methods.},
413 pages = {4330--4336},
415 journaltitle = {Expert Systems with Applications},
416 author = {Yang, Min-Chul and Rim, Hae-Chang},
417 urldate = {2016-07-19},
419 keywords = {Social Media, Interesting content, {LDA}, Topic model, twitter}
422 @article{fruchterman_graph_1991,
423 title = {Graph drawing by force-directed placement},
425 rights = {Copyright © 1991 John Wiley \& Sons, Ltd},
427 url = {http://onlinelibrary.wiley.com/doi/10.1002/spe.4380211102/abstract},
428 doi = {10.1002/spe.4380211102},
429 abstract = {We present a modification of the spring-embedder model of Eades [Congressus Numerantium, 42, 149–160, (1984)] for drawing undirected graphs with straight edges. Our heuristic strives for uniform edge lengths, and we develop it in analogy to forces in natural systems, for a simple, elegant, conceptually-intuitive, and efficient algorithm.},
430 pages = {1129--1164},
432 journaltitle = {Software: Practice and Experience},
433 shortjournal = {Softw: Pract. Exper.},
434 author = {Fruchterman, Thomas M. J. and Reingold, Edward M.},
435 urldate = {2016-07-20},
438 keywords = {Multi-level techniques, Force-directed placement, Graph drawing, Simulated annealing},
439 file = {Snapshot:/home/jeremy/Zotero/storage/SR6JA3QW/abstract.html:text/html}
442 @article{bastian_gephi:_2009,
443 title = {Gephi: an open source software for exploring and manipulating networks.},
445 url = {http://www.aaai.org/ocs/index.php/ICWSM/09/paper/viewFile/154/1009/},
446 shorttitle = {Gephi},
448 journaltitle = {{ICWSM}},
449 author = {Bastian, Mathieu and Heymann, Sebastien and Jacomy, Mathieu and others},
450 urldate = {2016-07-20},
452 file = {Bastian et al. - 2009 - Gephi an open source software for exploring and m.pdf:/home/jeremy/Zotero/storage/Q82CV3RM/Bastian et al. - 2009 - Gephi an open source software for exploring and m.pdf:application/pdf}
455 @unpublished{binfield_plos_2012,
456 location = {National Institute for Informatics},
457 title = {{PLoS} {ONE} and the rise of the Open Access {MegaJournal}},
458 url = {http://www.nii.ac.jp/sparc/en/event/2011/pdf/20120229_doc3_binfield.pdf},
459 note = {The 5th {SPARC} Japan Seminar 2011},
460 author = {Binfield, Peter},
461 urldate = {2016-07-20},
463 file = {[PDF] from nii.ac.jp:/home/jeremy/Zotero/storage/DU86MXEM/Binfield - 2003 - PLoS ONE and the rise of the Open Access MegaJourn.pdf:application/pdf}
466 @article{subelj_clustering_2016,
467 title = {Clustering Scientific Publications Based on Citation Relations: A Systematic Comparison of Different Methods},
470 url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0154404},
471 doi = {10.1371/journal.pone.0154404},
472 shorttitle = {Clustering Scientific Publications Based on Citation Relations},
473 abstract = {Clustering methods are applied regularly in the bibliometric literature to identify research areas or scientific fields. These methods are for instance used to group publications into clusters based on their relations in a citation network. In the network science literature, many clustering methods, often referred to as graph partitioning or community detection techniques, have been developed. Focusing on the problem of clustering the publications in a citation network, we present a systematic comparison of the performance of a large number of these clustering methods. Using a number of different citation networks, some of them relatively small and others very large, we extensively study the statistical properties of the results provided by different methods. In addition, we also carry out an expert-based assessment of the results produced by different methods. The expert-based assessment focuses on publications in the field of scientometrics. Our findings seem to indicate that there is a trade-off between different properties that may be considered desirable for a good clustering of publications. Overall, map equation methods appear to perform best in our analysis, suggesting that these methods deserve more attention from the bibliometric community.},
476 journaltitle = {{PLOS} {ONE}},
477 shortjournal = {{PLOS} {ONE}},
478 author = {Šubelj, Lovro and Eck, Nees Jan van and Waltman, Ludo},
479 urldate = {2016-07-20},
481 keywords = {Library Science, Bibliometrics, Graphs, Algorithms, Statistical methods, Optimization, Computer and information sciences, Scientometrics},
482 file = {Full Text PDF:/home/jeremy/Zotero/storage/UQJHZF6X/Šubelj et al. - 2016 - Clustering Scientific Publications Based on Citati.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/7T77BK72/article.html:text/html}
485 @article{small_co-citation_1973,
486 title = {Co-citation in the scientific literature: A new measure of the relationship between two documents},
488 rights = {Copyright © 1973 Wiley Periodicals, Inc., A Wiley Company},
490 url = {http://onlinelibrary.wiley.com/doi/10.1002/asi.4630240406/abstract},
491 doi = {10.1002/asi.4630240406},
492 shorttitle = {Co-citation in the scientific literature},
493 abstract = {A new form of document coupling called co-citation is defined as the frequency with which two documents are cited together. The co-citation frequency of two scientific papers can be determined by comparing lists of citing documents in the Science Citation Index and counting identical entries. Networks of co-cited papers can be generated for specific scientific specialties, and an example is drawn from the literature of particle physics. Co-citation patterns are found to differ significantly from bibliographic coupling patterns, but to agree generally with patterns of direct citation. Clusters of co-cited papers provide a new way to study the specialty structure of science. They may provide a new approach to indexing and to the creation of {SDI} profiles.},
496 journaltitle = {Journal of the American Society for Information Science},
497 shortjournal = {J. Am. Soc. Inf. Sci.},
498 author = {Small, Henry},
499 urldate = {2016-07-20},
502 file = {Full Text PDF:/home/jeremy/Zotero/storage/9HF57A4X/Small - 1973 - Co-citation in the scientific literature A new me.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/NF4S7SJ4/abstract.html:text/html}
505 @article{rosvall_map_2010,
506 title = {The map equation},
508 issn = {1951-6355, 1951-6401},
509 url = {http://link.springer.com/article/10.1140/epjst/e2010-01179-1},
510 doi = {10.1140/epjst/e2010-01179-1},
511 abstract = {Many real-world networks are so large that we must simplify their structure before we can extract useful information about the systems they represent. As the tools for doing these simplifications proliferate within the network literature, researchers would benefit from some guidelines about which of the so-called community detection algorithms are most appropriate for the structures they are studying and the questions they are asking. Here we show that different methods highlight different aspects of a network's structure and that the sort of information that we seek to extract about the system must guide us in our decision. For example, many community detection algorithms, including the popular modularity maximization approach, infer module assignments from an underlying model of the network formation process. However, we are not always as interested in how a system's network structure was formed, as we are in how a network's extant structure influences the system's behavior. To see how structure influences current behavior, we will recognize that links in a network induce movement across the network and result in system-wide interdependence. In doing so, we explicitly acknowledge that most networks carry flow. To highlight and simplify the network structure with respect to this flow, we use the map equation. We present an intuitive derivation of this flow-based and information-theoretic method and provide an interactive on-line application that anyone can use to explore the mechanics of the map equation. The differences between the map equation and the modularity maximization approach are not merely conceptual. Because the map equation attends to patterns of flow on the network and the modularity maximization approach does not, the two methods can yield dramatically different results for some network structures. To illustrate this and build our understanding of each method, we partition several sample networks. 
We also describe an algorithm and provide source code to efficiently decompose large weighted and directed networks based on the map equation.},
514 journaltitle = {The European Physical Journal Special Topics},
515 shortjournal = {Eur. Phys. J. Spec. Top.},
516 author = {Rosvall, M. and Axelsson, D. and Bergstrom, C. T.},
517 urldate = {2016-07-20},
520 file = {Full Text PDF:/home/jeremy/Zotero/storage/SP7AM2FW/Rosvall et al. - 2010 - The map equation.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/36S24FS9/e2010-01179-1.html:text/html}
523 @article{rosvall_maps_2008,
524 title = {Maps of random walks on complex networks reveal community structure},
526 issn = {0027-8424, 1091-6490},
527 url = {http://www.pnas.org/content/105/4/1118},
528 doi = {10.1073/pnas.0706851105},
529 abstract = {To comprehend the multipartite organization of large-scale biological and social systems, we introduce an information theoretic approach that reveals community structure in weighted and directed networks. We use the probability flow of random walks on a network as a proxy for information flows in the real system and decompose the network into modules by compressing a description of the probability flow. The result is a map that both simplifies and highlights the regularities in the structure and their relationships. We illustrate the method by making a map of scientific communication as captured in the citation patterns of {\textgreater}6,000 journals. We discover a multicentric organization with fields that vary dramatically in size and degree of integration into the network of science. Along the backbone of the network—including physics, chemistry, molecular biology, and medicine—information flows bidirectionally, but the map reveals a directional pattern of citation from the applied fields to the basic sciences.},
530 pages = {1118--1123},
532 journaltitle = {Proceedings of the National Academy of Sciences},
533 shortjournal = {{PNAS}},
534 author = {Rosvall, Martin and Bergstrom, Carl T.},
535 urldate = {2016-07-20},
539 keywords = {compression, clustering, information theory, map of science, bibliometrics},
540 file = {Full Text PDF:/home/jeremy/Zotero/storage/3HQG7TS3/Rosvall and Bergstrom - 2008 - Maps of random walks on complex networks reveal co.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/TG6S96XS/1118.html:text/html}
543 @article{ghosh_what_2013,
544 title = {What are we `tweeting' about obesity? Mapping tweets with topic modeling and Geographic Information System},
547 url = {http://dx.doi.org/10.1080/15230406.2013.776210},
548 doi = {10.1080/15230406.2013.776210},
549 shorttitle = {What are we `tweeting' about obesity?},
550 abstract = {Public health related tweets are difficult to identify in large conversational datasets like Twitter.com. Even more challenging is the visualization and analyses of the spatial patterns encoded in tweets. This study has the following objectives: how can topic modeling be used to identify relevant public health topics such as obesity on Twitter.com? What are the common obesity related themes? What is the spatial pattern of the themes? What are the research challenges of using large conversational datasets from social networking sites? Obesity is chosen as a test theme to demonstrate the effectiveness of topic modeling using Latent Dirichlet Allocation ({LDA}) and spatial analysis using Geographic Information System ({GIS}). The dataset is constructed from tweets (originating from the United States) extracted from Twitter.com on obesity-related queries. Examples of such queries are ‘food deserts’, ‘fast food’, and ‘childhood obesity’. The tweets are also georeferenced and time stamped. Three cohesive and meaningful themes such as ‘childhood obesity and schools’, ‘obesity prevention’, and ‘obesity and food habits’ are extracted from the {LDA} model. The {GIS} analysis of the extracted themes show distinct spatial pattern between rural and urban areas, northern and southern states, and between coasts and inland states. Further, relating the themes with ancillary datasets such as {US} census and locations of fast food restaurants based upon the location of the tweets in a {GIS} environment opened new avenues for spatial analyses and mapping. Therefore the techniques used in this study provide a possible toolset for computational social scientists in general, and health researchers in specific, to better understand health problems from large conversational datasets.},
553 journaltitle = {Cartography and Geographic Information Science},
554 author = {Ghosh, Debarchana and Guha, Rajarshi},
555 urldate = {2016-07-19},
557 file = {Full Text PDF:/home/jeremy/Zotero/storage/S3WJGXET/Ghosh and Guha - 2013 - What are we ‘tweeting’ about obesity Mapping twee.pdf:application/pdf}
560 @article{hidalgo_building_2009,
561 title = {The building blocks of economic complexity},
563 issn = {0027-8424, 1091-6490},
564 url = {http://www.pnas.org/content/106/26/10570},
565 doi = {10.1073/pnas.0900943106},
566 abstract = {For Adam Smith, wealth was related to the division of labor. As people and firms specialize in different activities, economic efficiency increases, suggesting that development is associated with an increase in the number of individual activities and with the complexity that emerges from the interactions between them. Here we develop a view of economic growth and development that gives a central role to the complexity of a country's economy by interpreting trade data as a bipartite network in which countries are connected to the products they export, and show that it is possible to quantify the complexity of a country's economy by characterizing the structure of this network. Furthermore, we show that the measures of complexity we derive are correlated with a country's level of income, and that deviations from this relationship are predictive of future growth. This suggests that countries tend to converge to the level of income dictated by the complexity of their productive structures, indicating that development efforts should focus on generating the conditions that would allow complexity to emerge to generate sustained growth and prosperity.},
567 pages = {10570--10575},
569 journaltitle = {Proceedings of the National Academy of Sciences},
570 shortjournal = {{PNAS}},
571 author = {Hidalgo, César A. and Hausmann, Ricardo},
572 urldate = {2016-07-20},
576 keywords = {networks, economic development},
577 file = {Full Text PDF:/home/jeremy/Zotero/storage/BSD98SD2/Hidalgo and Hausmann - 2009 - The building blocks of economic complexity.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/EXMG4VVB/10570.html:text/html}
580 @book{hausmann_atlas_2014,
581 title = {The Atlas of Economic Complexity: Mapping Paths to Prosperity},
582 isbn = {978-0-262-31773-3},
583 shorttitle = {The Atlas of Economic Complexity},
584 abstract = {Why do some countries grow and others do not? The authors of The Atlas of Economic Complexity offer readers an explanation based on "Economic Complexity," a measure of a society's productive knowledge. Prosperous societies are those that have the knowledge to make a larger variety of more complex products. The Atlas of Economic Complexity attempts to measure the amount of productive knowledge countries hold and how they can move to accumulate more of it by making more complex products. Through the graphical representation of the "Product Space," the authors are able to identify each country's "adjacent possible," or potential new products, making it easier to find paths to economic diversification and growth. In addition, they argue that a country's economic complexity and its position in the product space are better predictors of economic growth than many other well-known development indicators, including measures of competitiveness, governance, finance, and schooling. Using innovative visualizations, the book locates each country in the product space, provides complexity and growth potential rankings for 128 countries, and offers individual country pages with detailed information about a country's current capabilities and its diversification options. The maps and visualizations included in the Atlas can be used to find more viable paths to greater productive knowledge and prosperity.},
586 publisher = {{MIT} Press},
587 author = {Hausmann, Ricardo and Hidalgo, César A. and Bustos, Sebastián and Coscia, Michele and Simoes, Alexander and Yildirim, Muhammed A.},
590 keywords = {Business \& Economics / International / Economics, Business \& Economics / Economics / Macroeconomics}
593 @article{hood_literature_2001,
594 title = {The Literature of Bibliometrics, Scientometrics, and Informetrics},
597 url = {http://link.springer.com/10.1023/A:1017919924342},
598 doi = {10.1023/A:1017919924342},
601 journaltitle = {Scientometrics},
602 author = {Hood, William W. and Wilson, Concepción S.},
603 urldate = {2016-07-20},
607 @article{kessler_bibliographic_1963,
608 title = {Bibliographic coupling between scientific papers},
610 rights = {Copyright © 1963 Wiley Periodicals, Inc., A Wiley Company},
612 url = {http://onlinelibrary.wiley.com/doi/10.1002/asi.5090140103/abstract},
613 doi = {10.1002/asi.5090140103},
614 abstract = {This report describes the results of automatic processing of a large number of scientific papers according to a rigorously defined criterion of coupling. The population of papers under study was ordered into groups that satisfy the stated criterion of interrelation. An examination of the papers that constitute the groups shows a high degree of logical correlation.},
617 journaltitle = {American Documentation},
618 shortjournal = {Amer. Doc.},
619 author = {Kessler, M. M.},
620 urldate = {2016-04-20},
623 file = {Kessler - 1963 - Bibliographic coupling between scientific papers.pdf:/home/jeremy/Zotero/storage/SSZX4B3K/Kessler - 1963 - Bibliographic coupling between scientific papers.pdf:application/pdf}
626 @article{macy_factors_2002,
627 title = {From Factors to Actors: Computational Sociology and Agent-Based Modeling},
630 url = {http://www.jstor.org/stable/3069238},
631 shorttitle = {From Factors to Actors},
632 abstract = {Sociologists often model social processes as interactions among variables. We review an alternative approach that models social life as interactions among adaptive agents who influence one another in response to the influence they receive. These agent-based models ({ABMs}) show how simple and predictable local interactions can generate familiar but enigmatic global patterns, such as the diffusion of information, emergence of norms, coordination of conventions, or participation in collective action. Emergent social patterns can also appear unexpectedly and then just as dramatically transform or disappear, as happens in revolutions, market crashes, fads, and feeding frenzies. {ABMs} provide theoretical leverage where the global patterns of interest are more than the aggregation of individual attributes, but at the same time, the emergent pattern cannot be understood without a bottom up dynamical model of the microfoundations at the relational level. We begin with a brief historical sketch of the shift from "factors" to "actors" in computational sociology that shows how agent-based modeling differs fundamentally from earlier sociological uses of computer simulation. We then review recent contributions focused on the emergence of social structure and social order out of local interaction. Although sociology has lagged behind other social sciences in appreciating this new methodology, a distinctive sociological contribution is evident in the papers we review. First, theoretical interest focuses on dynamic social networks that shape and are shaped by agent interaction. Second, {ABMs} are used to perform virtual experiments that test macrosociological theories by manipulating structural factors like network topology, social stratification, or spatial mobility. We conclude our review with a series of recommendations for realizing the rich sociological potential of this approach.},
634 journaltitle = {Annual Review of Sociology},
635 shortjournal = {Annual Review of Sociology},
636 author = {Macy, Michael W. and Willer, Robert},
637 urldate = {2016-07-20},
641 @book{neef_digital_2014,
642 location = {Indianapolis, {IN}},
643 edition = {1},
644 title = {Digital Exhaust: What Everyone Should Know About Big Data, Digitization and Digitally Driven Innovation},
645 isbn = {978-0-13-383796-4},
646 shorttitle = {Digital Exhaust},
647 abstract = {Will "Big Data" supercharge the economy, tyrannize us, or both? Data Exhaust is the definitive primer for everyone who wants to understand all the implications of Big Data, digitally driven innovation, and the accelerating Internet Economy. Renowned digital expert Dale Neef clearly explains: What Big Data really is, and what's new and different about it How Big Data works, and what you need to know about Big Data technologies Where the data is coming from: how Big Data integrates sources ranging from social media to machine sensors, smartphones to financial transactions How companies use Big Data analytics to gain a more nuanced, accurate picture of their customers, their own performance, and the newest trends How governments and individual citizens can also benefit from Big Data How to overcome obstacles to success with Big Data – including poor data that can magnify human error A realistic assessment of Big Data threats to employment and personal privacy, now and in the future Neef places the Big Data phenomenon where it belongs: in the context of the broader global shift to the Internet economy, with all that implies. By doing so, he helps businesses plan Big Data strategy more effectively – and helps citizens and policymakers identify sensible policies for preventing its misuse. By conservative estimate, the global Big Data market will soar past \$50 billion by 2018. But those direct expenses represent just the "tip of the iceberg" when it comes to Big Data's impact. Big Data is now of acute strategic interest for every organization that aims to succeed – and it is equally important to everyone else. Whoever you are, Data Exhaust tells you exactly what you need to know about Big Data – and what to do about it, too.},
649 publisher = {Pearson {FT} Press},
650 author = {Neef, Dale},
654 @article{friedman_regularization_2010,
655 title = {Regularization Paths for Generalized Linear Models via Coordinate Descent},
658 url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2929880/},
659 abstract = {We develop fast algorithms for estimation of generalized linear models with convex penalties. The models include linear regression, two-class logistic regression, and multinomial regression problems while the penalties include ℓ1 (the lasso), ℓ2 (ridge regression) and mixtures of the two (the elastic net). The algorithms use cyclical coordinate descent, computed along a regularization path. The methods can handle large problems and can also deal efficiently with sparse features. In comparative timings we find that the new algorithms are considerably faster than competing methods.},
662 journaltitle = {Journal of Statistical Software},
663 shortjournal = {J Stat Softw},
664 author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob},
665 urldate = {2016-07-20},
671 @book{james_introduction_2013,
672 location = {New York},
673 title = {An introduction to statistical learning: with applications in R},
674 isbn = {978-1-4614-7137-0},
675 shorttitle = {An introduction to statistical learning},
676 abstract = {"An Introduction to Statistical Learning provides an accessible overview of the field of statistical learning, an essential toolset for making sense of the vast and complex data sets that have emerged in fields ranging from biology to finance to marketing to astrophysics in the past twenty years. This book presents some of the most important modeling and prediction techniques, along with relevant applications. Topics include linear regression, classification, resampling methods, shrinkage approaches, tree-based methods, support vector machines, clustering, and more. Color graphics and real-world examples are used to illustrate the methods presented. Since the goal of this textbook is to facilitate the use of these statistical learning techniques by practitioners in science, industry, and other fields, each chapter contains a tutorial on implementing the analyses and methods presented in R, an extremely popular open source statistical software platform. Two of the authors co-wrote The Elements of Statistical Learning (Hastie, Tibshirani and Friedman, 2nd edition 2009), a popular reference book for statistics and machine learning researchers. An Introduction to Statistical Learning covers many of the same topics, but at a level accessible to a much broader audience. This book is targeted at statisticians and non-statisticians alike who wish to use cutting-edge statistical learning techniques to analyze their data. The text assumes only a previous course in linear regression and no knowledge of matrix algebra. Provides tools for Statistical Learning that are essential for practitioners in science, industry and other fields. Analyses and methods are presented in R. Topics include linear regression, classification, resampling methods, shrinkage approaches, tree-based methods, support vector machines, and clustering. Extensive use of color graphics assist the reader"--Publisher description.},
677 publisher = {Springer},
678 author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert},
682 @article{tibshirani_regression_1996,
683 title = {Regression Shrinkage and Selection via the Lasso},
686 url = {http://www.jstor.org/stable/2346178},
687 abstract = {We propose a new method for estimation in linear models. The `lasso' minimizes the residual sum of squares subject to the sum of the absolute value of the coefficients being less than a constant. Because of the nature of this constraint it tends to produce some coefficients that are exactly 0 and hence gives interpretable models. Our simulation studies suggest that the lasso enjoys some of the favourable properties of both subset selection and ridge regression. It produces interpretable models like subset selection and exhibits the stability of ridge regression. There is also an interesting relationship with recent work in adaptive function estimation by Donoho and Johnstone. The lasso idea is quite general and can be applied in a variety of statistical models: extensions to generalized regression models and tree-based models are briefly described.},
690 journaltitle = {Journal of the Royal Statistical Society. Series B (Methodological)},
691 shortjournal = {Journal of the Royal Statistical Society. Series B (Methodological)},
692 author = {Tibshirani, Robert},
693 urldate = {2016-07-20},
697 @report{bollen_social_2015,
698 title = {Social, Behavioral, and Economic Sciences Perspectives on Robust and Reliable Science},
699 url = {http://www.nsf.gov/sbe/AC_Materials/SBE_Robust_and_Reliable_Research_Report.pdf},
700 institution = {National Science Foundation},
701 author = {Bollen, Kenneth and Cacioppo, John T. and Kaplan, Robert M. and Krosnick, Jon A. and Olds, James L. and Dean, Heather},
705 @article{stodden_toward_2013,
706 title = {Toward Reproducible Computational Research: An Empirical Analysis of Data and Code Policy Adoption by Journals},
709 url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0067111},
710 doi = {10.1371/journal.pone.0067111},
711 shorttitle = {Toward Reproducible Computational Research},
712 abstract = {Journal policy on research data and code availability is an important part of the ongoing shift toward publishing reproducible computational science. This article extends the literature by studying journal data sharing policies by year (for both 2011 and 2012) for a referent set of 170 journals. We make a further contribution by evaluating code sharing policies, supplemental materials policies, and open access status for these 170 journals for each of 2011 and 2012. We build a predictive model of open data and code policy adoption as a function of impact factor and publisher and find higher impact journals more likely to have open data and code policies and scientific societies more likely to have open data and code policies than commercial publishers. We also find open data policies tend to lead open code policies, and we find no relationship between open data and code policies and either supplemental material policies or open access journal status. Of the journals in this study, 38\% had a data policy, 22\% had a code policy, and 66\% had a supplemental materials policy as of June 2012. This reflects a striking one year increase of 16\% in the number of data policies, a 30\% increase in code policies, and a 7\% increase in the number of supplemental materials policies. We introduce a new dataset to the community that categorizes data and code sharing, supplemental materials, and open access policies in 2011 and 2012 for these 170 journals.},
715 journaltitle = {{PLOS} {ONE}},
716 shortjournal = {{PLOS} {ONE}},
717 author = {Stodden, Victoria and Guo, Peixuan and Ma, Zhaokun},
718 urldate = {2016-07-22},
720 keywords = {Reproducibility, science policy, computational biology, Open access, Scientific publishing, open data, Computer and information sciences, Data management},
721 file = {Full Text PDF:/home/jeremy/Zotero/storage/PIC8KFJE/Stodden et al. - 2013 - Toward Reproducible Computational Research An Emp.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/NTS2JK5S/article.html:text/html}
724 @article{leveque_reproducible_2012,
725 title = {Reproducible research for scientific computing: Tools and strategies for changing the culture},
728 shorttitle = {Reproducible research for scientific computing},
731 journaltitle = {Computing in Science and Engineering},
732 author = {{LeVeque}, Randall J. and Mitchell, Ian M. and Stodden, Victoria},
734 file = {LeVeque et al. - 2012 - Reproducible research for scientific computing To.pdf:/home/jeremy/Zotero/storage/2FHZTG9Q/LeVeque et al. - 2012 - Reproducible research for scientific computing To.pdf:application/pdf}
737 @book{wilensky_introduction_2015,
738 location = {Cambridge, Massachusetts},
739 title = {An introduction to agent-based modeling: modeling natural, social, and engineered complex systems with {NetLogo}},
740 shorttitle = {An introduction to agent-based modeling},
741 publisher = {{MIT} Press},
742 author = {Wilensky, Uri and Rand, William},
743 urldate = {2016-07-19},
747 @article{welles_minorities_2014,
748 title = {On minorities and outliers: The case for making Big Data small},
751 url = {http://bds.sagepub.com/content/1/1/2053951714540613},
752 doi = {10.1177/2053951714540613},
753 shorttitle = {On minorities and outliers},
754 abstract = {In this essay, I make the case for choosing to examine small subsets of Big Data datasets—making big data small. Big Data allows us to produce summaries of human behavior at a scale never before possible. But in the push to produce these summaries, we risk losing sight of a secondary but equally important advantage of Big Data—the plentiful representation of minorities. Women, minorities and statistical outliers have historically been omitted from the scientific record, with problematic consequences. Big Data affords the opportunity to remedy those omissions. However, to do so, Big Data researchers must choose to examine very small subsets of otherwise large datasets. I encourage researchers to embrace an ethical, empirical and epistemological stance on Big Data that includes minorities and outliers as reference categories, rather than the exceptions to statistical norms.},
755 pages = {2053951714540613},
757 journaltitle = {Big Data \& Society},
758 author = {Welles, Brooke Foucault},
759 urldate = {2016-07-23},
762 file = {Full Text PDF:/home/jeremy/Zotero/storage/SS8P2JN4/Welles - 2014 - On minorities and outliers The case for making Bi.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/M2HTAVP2/2053951714540613.html:text/html}
765 @book{hansen_analyzing_2010,
766 location = {Burlington, Massachusetts},
767 title = {Analyzing social media networks with {NodeXL}: Insights from a connected world},
768 shorttitle = {Analyzing social media networks with {NodeXL}},
769 publisher = {Morgan Kaufmann},
770 author = {Hansen, Derek and Shneiderman, Ben and Smith, Marc A.},
771 urldate = {2016-07-18},
775 @inproceedings{asur_predicting_2010,
776 title = {Predicting the Future with Social Media},
778 doi = {10.1109/WI-IAT.2010.63},
779 abstract = {In recent years, social media has become ubiquitous and important for social networking and content sharing. And yet, the content that is generated from these websites remains largely untapped. In this paper, we demonstrate how social media content can be used to predict real-world outcomes. In particular, we use the chatter from Twitter.com to forecast box-office revenues for movies. We show that a simple model built from the rate at which tweets are created about particular topics can outperform market-based predictors. We further demonstrate how sentiments extracted from Twitter can be utilized to improve the forecasting power of social media.},
780 eventtitle = {2010 {IEEE}/{WIC}/{ACM} International Conference on Web Intelligence and Intelligent Agent Technology ({WI}-{IAT})},
782 booktitle = {2010 {IEEE}/{WIC}/{ACM} International Conference on Web Intelligence and Intelligent Agent Technology ({WI}-{IAT})},
783 author = {Asur, S. and Huberman, B. A.},
785 keywords = {Web sites, Social Media, attention, prediction, social media content, content sharing, social networking (online), market-based predictors, Twitter.com, social networking},
786 file = {IEEE Xplore Abstract Record:/home/jeremy/Zotero/storage/AT38MBGW/articleDetails.html:text/html;IEEE Xplore Abstract Record:/home/jeremy/Zotero/storage/NAPSZ9F4/login.html:text/html;IEEE Xplore Full Text PDF:/home/jeremy/Zotero/storage/5XINGQC4/Asur and Huberman - 2010 - Predicting the Future with Social Media.pdf:application/pdf}
789 @article{blei_latent_2003,
790 title = {Latent {Dirichlet} allocation},
792 url = {http://dl.acm.org/citation.cfm?id=944937},
794 journaltitle = {The Journal of Machine Learning Research},
795 author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
796 urldate = {2015-12-03},
798 file = {Blei et al. - 2003 - Latent dirichlet allocation.pdf:/home/jeremy/Zotero/storage/2K3E7TJH/Blei et al. - 2003 - Latent dirichlet allocation.pdf:application/pdf}
801 @article{dimaggio_exploiting_2013,
802 title = {Exploiting affinities between topic modeling and the sociological perspective on culture: Application to newspaper coverage of U.S. government arts funding},
805 url = {http://linkinghub.elsevier.com/retrieve/pii/S0304422X13000661},
806 doi = {10.1016/j.poetic.2013.08.004},
807 shorttitle = {Exploiting affinities between topic modeling and the sociological perspective on culture},
810 journaltitle = {Poetics},
811 author = {{DiMaggio}, Paul and Nag, Manish and Blei, David},
812 urldate = {2016-01-02},
815 file = {exploiting-affinities.pdf:/home/jeremy/Zotero/storage/7D8NAGNB/exploiting-affinities.pdf:application/pdf}
818 @inproceedings{cheng_can_2014,
819 location = {New York, {NY}, {USA}},
820 title = {Can cascades be predicted?},
821 isbn = {978-1-4503-2744-2},
822 url = {http://doi.acm.org/10.1145/2566486.2567997},
823 doi = {10.1145/2566486.2567997},
824 series = {{WWW} '14},
825 abstract = {On many social networking web sites such as Facebook and Twitter, resharing or reposting functionality allows users to share others' content with their own friends or followers. As content is reshared from user to user, large cascades of reshares can form. While a growing body of research has focused on analyzing and characterizing such cascades, a recent, parallel line of work has argued that the future trajectory of a cascade may be inherently unpredictable. In this work, we develop a framework for addressing cascade prediction problems. On a large sample of photo reshare cascades on Facebook, we find strong performance in predicting whether a cascade will continue to grow in the future. We find that the relative growth of a cascade becomes more predictable as we observe more of its reshares, that temporal and structural features are key predictors of cascade size, and that initially, breadth, rather than depth in a cascade is a better indicator of larger cascades. This prediction performance is robust in the sense that multiple distinct classes of features all achieve similar performance. We also discover that temporal features are predictive of a cascade's eventual shape. Observing independent cascades of the same content, we find that while these cascades differ greatly in size, we are still able to predict which ends up the largest.},
827 booktitle = {Proceedings of the 23rd International Conference on World Wide Web},
829 author = {Cheng, Justin and Adamic, Lada and Dow, P. Alex and Kleinberg, Jon Michael and Leskovec, Jure},
830 urldate = {2015-04-06},
832 keywords = {cascade prediction, contagion, information diffusion},
833 file = {Cheng et al. - 2014 - Can Cascades Be Predicted.pdf:/home/jeremy/Zotero/storage/KPPCCRXU/Cheng et al. - 2014 - Can Cascades Be Predicted.pdf:application/pdf}
836 @article{pedregosa_scikit-learn:_2011,
837 title = {Scikit-learn: Machine learning in {Python}},
839 url = {http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html},
840 shorttitle = {Scikit-learn},
841 abstract = {Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and {API} consistency. It has minimal dependencies and is distributed under the simplified {BSD} license, encouraging its use in both academic and commercial settings. Source code, binaries, and documentation can be downloaded from http://scikit-learn.sourceforge.net.},
842 pages = {2825--2830},
843 journaltitle = {Journal of Machine Learning Research},
844 author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard},
845 urldate = {2016-06-07},
847 note = {bibtex: pedregosa\_scikit-learn:\_2011},
848 file = {Scikit-learn\: Machine Learning in Python:/home/jeremy/Zotero/storage/6XS2PM2P/Pedregosa et al. - 2011 - Scikit-learn Machine Learning in Python.pdf:application/pdf}
851 @article{zimmer_okcupid_2016,
852 title = {{OkCupid} Study Reveals the Perils of Big-Data Science},
853 url = {https://www.wired.com/2016/05/okcupid-study-reveals-perils-big-data-science/},
854 abstract = {The data of 70,000 {OKCupid} users is now searchable in a database. Ethicist Michael T Zimmer explains why it doesn't matter that it was "already public."},
855 journaltitle = {{WIRED}},
856 author = {Zimmer, Michael},
857 urldate = {2016-08-31},
859 file = {Snapshot:/home/jeremy/Zotero/storage/KV5P4IA9/okcupid-study-reveals-perils-big-data-science.html:text/html}
862 @article{merton_matthew_1968,
863 title = {The Matthew effect in science},
865 url = {http://www.unc.edu/~fbaum/teaching/PLSC541_Fall06/Merton_Science_1968.pdf},
868 journaltitle = {Science},
869 author = {Merton, Robert K.},
870 urldate = {2014-09-27},
872 file = {[PDF] from unc.edu:/home/jeremy/Zotero/storage/B3H2PG6R/Merton - 1968 - The Matthew effect in science.pdf:application/pdf}
875 @article{barabasi_emergence_1999,
876 title = {Emergence of Scaling in Random Networks},
878 issn = {0036-8075, 1095-9203},
879 url = {http://science.sciencemag.org/content/286/5439/509},
880 doi = {10.1126/science.286.5439.509},
881 abstract = {Systems as diverse as genetic networks or the World Wide Web are best described as networks with complex topology. A common property of many large networks is that the vertex connectivities follow a scale-free power-law distribution. This feature was found to be a consequence of two generic mechanisms: (i) networks expand continuously by the addition of new vertices, and (ii) new vertices attach preferentially to sites that are already well connected. A model based on these two ingredients reproduces the observed stationary scale-free distributions, which indicates that the development of large networks is governed by robust self-organizing phenomena that go beyond the particulars of the individual systems.},
884 journaltitle = {Science},
885 author = {Barabási, Albert-László and Albert, Réka},
886 urldate = {2016-10-06},
890 file = {Barabási and Albert - 1999 - Emergence of Scaling in Random Networks.pdf:/home/jeremy/Zotero/storage/D4DAX5XA/Barabási and Albert - 1999 - Emergence of Scaling in Random Networks.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/JETSMGUZ/509.html:text/html}
893 @article{rosvall_mapping_2010,
894 title = {Mapping Change in Large Networks},
897 url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0008694},
898 doi = {10.1371/journal.pone.0008694},
899 abstract = {Change is a fundamental ingredient of interaction patterns in biology, technology, the economy, and science itself: Interactions within and between organisms change; transportation patterns by air, land, and sea all change; the global financial flow changes; and the frontiers of scientific research change. Networks and clustering methods have become important tools to comprehend instances of these large-scale structures, but without methods to distinguish between real trends and noisy data, these approaches are not useful for studying how networks change. Only if we can assign significance to the partitioning of single networks can we distinguish meaningful structural changes from random fluctuations. Here we show that bootstrap resampling accompanied by significance clustering provides a solution to this problem. To connect changing structures with the changing function of networks, we highlight and summarize the significant structural changes with alluvial diagrams and realize de Solla Price's vision of mapping change in science: studying the citation pattern between about 7000 scientific journals over the past decade, we find that neuroscience has transformed from an interdisciplinary specialty to a mature and stand-alone discipline.},
902 journaltitle = {{PLOS} {ONE}},
903 shortjournal = {{PLOS} {ONE}},
904 author = {Rosvall, Martin and Bergstrom, Carl T.},
905 urldate = {2016-07-08},
907 keywords = {Medicine and health sciences, Behavioral neuroscience, neuroscience, Algorithms, Structure of markets, Molecular neuroscience, Simulated annealing, Cellular neuroscience},
908 file = {Full Text PDF:/home/jeremy/Zotero/storage/79Q8AFD4/Rosvall and Bergstrom - 2010 - Mapping Change in Large Networks.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/7Z6NMBHX/article.html:text/html}
911 @inproceedings{tufekci_big_2014,
912 title = {Big Questions for social media big data: Representativeness, validity and other methodological pitfalls},
913 isbn = {978-1-57735-657-8},
914 shorttitle = {Big Questions for social media big data},
915 abstract = {Large-scale databases of human activity in social media have captured scientific and policy attention, producing a flood of research and discussion. This paper considers methodological and conceptual challenges for this emergent field, with special attention to the validity and representativeness of social media big data analyses. Persistent issues include the over-emphasis of a single platform, Twitter, sampling biases arising from selection by hashtags, and vague and unrepresentative sampling frames. The sociocultural complexity of user behavior aimed at algorithmic invisibility (such as subtweeting, mock-retweeting, use of "screen captures" for text, etc.) further complicate interpretation of big data social media. Other challenges include accounting for field effects, i.e. broadly consequential events that do not diffuse only through the network under study but affect the whole society. The application of network methods from other fields to the study of human social activity may not always be appropriate. The paper concludes with a call to action on practical steps to improve our analytic capacity in this promising, rapidly-growing field. Copyright © 2014, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.},
916 eventtitle = {Proceedings of the 8th International Conference on Weblogs and Social Media, {ICWSM} 2014},
918 author = {Tufekci, Z.},
922 @article{lazer_parable_2014,
923 title = {The Parable of Google Flu: Traps in Big Data Analysis},
925 rights = {Copyright © 2014, American Association for the Advancement of Science},
926 issn = {0036-8075, 1095-9203},
927 url = {http://science.sciencemag.org/content/343/6176/1203},
928 doi = {10.1126/science.1248506},
929 shorttitle = {The Parable of Google Flu},
930 abstract = {In February 2013, Google Flu Trends ({GFT}) made headlines but not for a reason that Google executives or the creators of the flu tracking system would have hoped. Nature reported that {GFT} was predicting more than double the proportion of doctor visits for influenza-like illness ({ILI}) than the Centers for Disease Control and Prevention ({CDC}), which bases its estimates on surveillance reports from laboratories across the United States (1, 2). This happened despite the fact that {GFT} was built to predict {CDC} reports. Given that {GFT} is often held up as an exemplary use of big data (3, 4), what lessons can we draw from this error?
931 Large errors in flu prediction were largely avoidable, which offers lessons for the use of big data.},
933 pages = {1203--1205},
935 journaltitle = {Science},
936 author = {Lazer, David and Kennedy, Ryan and King, Gary and Vespignani, Alessandro},
937 urldate = {2016-10-06},
941 file = {Full Text PDF:/home/jeremy/Zotero/storage/UFHNQF8W/Lazer et al. - 2014 - The Parable of Google Flu Traps in Big Data Analy.pdf:application/pdf}
944 @article{boyd_critical_2012,
945 title = {Critical questions for big data},
948 url = {http://dx.doi.org/10.1080/1369118X.2012.678878},
949 doi = {10.1080/1369118X.2012.678878},
950 abstract = {The era of Big Data has begun. Computer scientists, physicists, economists, mathematicians, political scientists, bio-informaticists, sociologists, and other scholars are clamoring for access to the massive quantities of information produced by and about people, things, and their interactions. Diverse groups argue about the potential benefits and costs of analyzing genetic sequences, social media interactions, health records, phone logs, government records, and other digital traces left by people. Significant questions emerge. Will large-scale search data help us create better tools, services, and public goods? Or will it usher in a new wave of privacy incursions and invasive marketing? Will data analytics help us understand online communities and political movements? Or will it be used to track protesters and suppress speech? Will it transform how we study human communication and culture, or narrow the palette of research options and alter what ‘research’ means? Given the rise of Big Data as a socio-technical phenomenon, we argue that it is necessary to critically interrogate its assumptions and biases. In this article, we offer six provocations to spark conversations about the issues of Big Data: a cultural, technological, and scholarly phenomenon that rests on the interplay of technology, analysis, and mythology that provokes extensive utopian and dystopian rhetoric.},
953 journaltitle = {Information, Communication \& Society},
954 author = {given=danah, family=boyd and Crawford, Kate},
955 urldate = {2016-08-09},
957 file = {boyd and Crawford - 2012 - Critical Questions for Big Data.pdf:/home/jeremy/Zotero/storage/XEM23ZJG/boyd and Crawford - 2012 - Critical Questions for Big Data.pdf:application/pdf}
960 @book{silver_signal_2015,
961 location = {New York, New York},
962 title = {The Signal and the Noise: Why So Many Predictions Fail--but Some Don't},
963 isbn = {978-0-14-312508-2},
964 shorttitle = {The Signal and the Noise},
965 abstract = {One of Wall Street Journal's Best Ten Works of Nonfiction in 2012 New York Times Bestseller “Not so different in spirit from the way public intellectuals like John Kenneth Galbraith once shaped discussions of economic policy and public figures like Walter Cronkite helped sway opinion on the Vietnam War…could turn out to be one of the more momentous books of the decade.” —New York Times Book Review "Nate Silver's The Signal and the Noise is The Soul of a New Machine for the 21st century." —Rachel Maddow, author of Drift "A serious treatise about the craft of prediction—without academic mathematics—cheerily aimed at lay readers. Silver's coverage is polymathic, ranging from poker and earthquakes to climate change and terrorism." —New York Review of Books Nate Silver built an innovative system for predicting baseball performance, predicted the 2008 election within a hair’s breadth, and became a national sensation as a blogger—all by the time he was thirty. He solidified his standing as the nation's foremost political forecaster with his near perfect prediction of the 2012 election. Silver is the founder and editor in chief of {FiveThirtyEight}.com. Drawing on his own groundbreaking work, Silver examines the world of prediction, investigating how we can distinguish a true signal from a universe of noisy data. Most predictions fail, often at great cost to society, because most of us have a poor understanding of probability and uncertainty. Both experts and laypeople mistake more confident predictions for more accurate ones. But overconfidence is often the reason for failure. If our appreciation of uncertainty improves, our predictions can get better too. 
This is the “prediction paradox”: The more humility we have about our ability to make predictions, the more successful we can be in planning for the future. In keeping with his own aim to seek truth from data, Silver visits the most successful forecasters in a range of areas, from hurricanes to baseball, from the poker table to the stock market, from Capitol Hill to the {NBA}. He explains and evaluates how these forecasters think and what bonds they share. What lies behind their success? Are they good—or just lucky? What patterns have they unraveled? And are their forecasts really right? He explores unanticipated commonalities and exposes unexpected juxtapositions. And sometimes, it is not so much how good a prediction is in an absolute sense that matters but how good it is relative to the competition. In other cases, prediction is still a very rudimentary—and dangerous—science. Silver observes that the most accurate forecasters tend to have a superior command of probability, and they tend to be both humble and hardworking. They distinguish the predictable from the unpredictable, and they notice a thousand little details that lead them closer to the truth. Because of their appreciation of probability, they can distinguish the signal from the noise. With everything from the health of the global economy to our ability to fight terrorism dependent on the quality of our predictions, Nate Silver’s insights are an essential read.},
967 publisher = {Penguin Books},
968 author = {Silver, Nate},
972 @online{sandvig_why_2016,
973 title = {Why I Am Suing the Government},
974 url = {https://socialmediacollective.org/2016/07/01/why-i-am-suing-the-government/},
975 titleaddon = {Social Media Collective Research Blog},
977 author = {Sandvig, Christian},
978 urldate = {2016-10-23},
980 file = {Snapshot:/home/jeremy/Zotero/storage/9USUHHJB/why-i-am-suing-the-government.html:text/html}
983 @book{domingos_master_2015,
984 location = {New York, New York},
985 title = {The Master Algorithm: How the Quest for the Ultimate Learning Machine Will Remake Our World},
986 shorttitle = {The Master Algorithm},
987 abstract = {Algorithms increasingly run our lives. They find books, movies, jobs, and dates for us, manage our investments, and discover new drugs. More and more, these algorithms work by learning from the trails of data we leave in our newly digital world. Like curious children, they observe us, imitate, and experiment. And in the world’s top research labs and universities, the race is on to invent the ultimate learning algorithm: one capable of discovering any knowledge from data, and doing anything we want, before we even ask. Machine learning is the automation of discovery—the scientific method on steroids—that enables intelligent robots and computers to program themselves. No field of science today is more important yet more shrouded in mystery. Pedro Domingos, one of the field’s leading lights, lifts the veil for the first time to give us a peek inside the learning machines that power Google, Amazon, and your smartphone. He charts a course through machine learning’s five major schools of thought, showing how they turn ideas from neuroscience, evolution, psychology, physics, and statistics into algorithms ready to serve you. Step by step, he assembles a blueprint for the future universal learner—the Master Algorithm—and discusses what it means for you, and for the future of business, science, and society. If data-ism is today’s rising philosophy, this book will be its bible. The quest for universal learning is one of the most significant, fascinating, and revolutionary intellectual developments of all time. A groundbreaking book, The Master Algorithm is the essential guide for anyone and everyone wanting to understand not just how the revolution will happen, but how to be at its forefront.},
989 publisher = {Basic Books},
990 author = {Domingos, Pedro},
994 @inproceedings{arun_finding_2010,
995 title = {On Finding the Natural Number of Topics with Latent Dirichlet Allocation: Some Observations},
996 isbn = {978-3-642-13656-6},
997 url = {https://link.springer.com/chapter/10.1007/978-3-642-13657-3_43},
998 doi = {10.1007/978-3-642-13657-3_43},
999 series = {Lecture Notes in Computer Science},
1000 shorttitle = {On Finding the Natural Number of Topics with Latent Dirichlet Allocation},
1001 abstract = {It is important to identify the “correct” number of topics in mechanisms like Latent Dirichlet Allocation ({LDA}) as they determine the quality of features that are presented as features for classifiers like {SVM}. In this work we propose a measure to identify the correct number of topics and offer empirical evidence in its favor in terms of classification accuracy and the number of topics that are naturally present in the corpus. We show the merit of the measure by applying it on real-world as well as synthetic data sets (both text and images). In proposing this measure, we view {LDA} as a matrix factorization mechanism, wherein a given corpus C is split into two matrix factors M1 and M2 as given by $C_{d \times w} = M1_{d \times t} \times Q_{t \times w}$, where d is the number of documents present in the corpus and w is the size of the vocabulary. The quality of the split depends on “t”, the right number of topics chosen. The measure is computed in terms of symmetric {KL}-Divergence of salient distributions that are derived from these matrix factors. We observe that the divergence values are higher for non-optimal number of topics – this is shown by a ’dip’ at the right value for ’t’.},
1002 eventtitle = {Pacific-Asia Conference on Knowledge Discovery and Data Mining},
1004 booktitle = {Advances in Knowledge Discovery and Data Mining},
1005 publisher = {Springer, Berlin, Heidelberg},
1006 author = {Arun, R. and Suresh, V. and Madhavan, C. E. Veni and Murthy, M. N. Narasimha},
1007 urldate = {2017-07-06},
1008 date = {2010-06-21},
1010 file = {Arun et al. - 2010 - On Finding the Natural Number of Topics with Laten.pdf:/home/jeremy/Zotero/storage/EMMCNH7F/Arun et al. - 2010 - On Finding the Natural Number of Topics with Laten.pdf:application/pdf}