@article{howison_flossmole_2006, title = {{FLOSSmole}}, volume = {1}, issn = {1554-1045, 1554-1053}, url = {http://www.igi-global.com/article/international-journal-information-technology-web/2610}, doi = {10.4018/jitwe.2006070102}, pages = {17--26}, number = {3}, journaltitle = {International Journal of Information Technology and Web Engineering}, author = {Howison, James and Conklin, Megan and Crowston, Kevin}, urldate = {2013-06-15}, date = {2006} } @article{bohannon_google_2011, title = {Google Books, Wikipedia, and the Future of Culturomics}, volume = {331}, issn = {0036-8075, 1095-9203}, url = {http://www.sciencemag.org/content/331/6014/135}, doi = {10.1126/science.331.6014.135}, abstract = {As a follow-up to the quantitative analysis of data obtained from Google Books published online in Science on 16 December 2010 and in this week's issue on page 176, one of the study's authors has been using Wikipedia to analyze the fame of scientists whose names appear in books over the centuries. But his effort has been hampered by the online encyclopedia's shortcomings, from the reliability of its information to the organization of its content. 
Several efforts are under way to improve Wikipedia as a teaching and research tool, including one by the Association for Psychological Science that seeks to create a more complete and accurate representation of its field.}, pages = {135--135}, number = {6014}, journaltitle = {Science}, author = {Bohannon, John}, urldate = {2014-02-14}, date = {2011-01}, langid = {english}, pmid = {21233356} } @article{welles_visualizing_2015, title = {Visualizing Computational Social Science The Multiple Lives of a Complex Image}, volume = {37}, url = {http://scx.sagepub.com/content/37/1/34.short}, pages = {34--58}, number = {1}, journaltitle = {Science Communication}, author = {Welles, Brooke Foucault and Meirelles, Isabel}, urldate = {2015-08-05}, date = {2015}, file = {[PDF] from sagepub.com:/home/jeremy/Zotero/storage/AMRMRGNB/Welles and Meirelles - 2015 - Visualizing Computational Social Science The Multi.pdf:application/pdf} } @article{van_noorden_interdisciplinary_2015, title = {Interdisciplinary research by the numbers}, volume = {525}, issn = {0028-0836, 1476-4687}, url = {http://www.nature.com/doifinder/10.1038/525306a}, doi = {10.1038/525306a}, pages = {306--307}, number = {7569}, journaltitle = {Nature}, author = {Van Noorden, Richard}, urldate = {2015-09-21}, date = {2015-09-16} } @article{mcfarland_sociology_2015, title = {Sociology in the Era of Big Data: The Ascent of Forensic Social Science}, issn = {0003-1232, 1936-4784}, url = {http://link.springer.com/article/10.1007/s12108-015-9291-8}, doi = {10.1007/s12108-015-9291-8}, shorttitle = {Sociology in the Era of Big Data}, pages = {1--24}, journaltitle = {The American Sociologist}, shortjournal = {Am Soc}, author = {{McFarland}, Daniel A. 
and Lewis, Kevin and Goldberg, Amir}, urldate = {2015-09-25}, date = {2015-09-17}, langid = {english}, keywords = {Forensic social science, Social Sciences, general, Sociology of science, Sociology, general, Computational social science, Big data}, file = {Full Text PDF:/home/jeremy/Zotero/storage/F66XW8K7/McFarland et al. - 2015 - Sociology in the Era of Big Data The Ascent of Fo.pdf:application/pdf} } @article{hargittai_is_2015, title = {Is Bigger Always Better? Potential Biases of Big Data Derived from Social Network Sites}, volume = {659}, issn = {0002-7162, 1552-3349}, url = {http://ann.sagepub.com/content/659/1/63}, doi = {10.1177/0002716215570866}, shorttitle = {Is Bigger Always Better?}, abstract = {This article discusses methodological challenges of using big data that rely on specific sites and services as their sampling frames, focusing on social network sites in particular. It draws on survey data to show that people do not select into the use of such sites randomly. Instead, use is biased in certain ways yielding samples that limit the generalizability of findings. Results show that age, gender, race/ethnicity, socioeconomic status, online experiences, and Internet skills all influence the social network sites people use and thus where traces of their behavior show up. This has implications for the types of conclusions one can draw from data derived from users of specific sites. 
The article ends by noting how big data studies can address the shortcomings that result from biased sampling frames.}, pages = {63--76}, number = {1}, journaltitle = {The {ANNALS} of the American Academy of Political and Social Science}, shortjournal = {The {ANNALS} of the American Academy of Political and Social Science}, author = {Hargittai, Eszter}, urldate = {2015-10-19}, date = {2015-05-01}, langid = {english}, keywords = {digital inequality, social network sites, sampling, Internet skills, sampling frame, biased sample, Big data} } @article{lazer_computational_2009, title = {Computational Social Science}, volume = {323}, url = {http://www.sciencemag.org}, doi = {10.1126/science.1167742}, shorttitle = {{SOCIAL} {SCIENCE}}, pages = {721--723}, number = {5915}, journaltitle = {Science}, author = {Lazer, David and Pentland, Alex and Adamic, Lada and Aral, Sinan and Barabasi, Albert-Laszlo and Brewer, Devon and Christakis, Nicholas and Contractor, Noshir and Fowler, James and Gutmann, Myron and Jebara, Tony and King, Gary and Macy, Michael and Roy, Deb and Van Alstyne, Marshall}, urldate = {2009-03-06}, date = {2009-02-06}, file = {HighWire Snapshot:/home/jeremy/Zotero/storage/C939DFAS/721.html:text/html;PubMed Central Full Text PDF:/home/jeremy/Zotero/storage/RPX8A4ID/Lazer et al. - 2009 - Life in the network the coming age of computation.pdf:application/pdf} } @article{mann_bibliometric_2006, title = {Bibliometric impact measures leveraging topic analysis}, abstract = {Measurements of the impact and history of research literature provide a useful complement to scientific digital library collections. Bibliometric indicators have been extensively studied, mostly in the context of journals. However, journal-based metrics poorly capture topical distinctions in fast-moving fields, and are increasingly problematic with the rise of open-access publishing. Recent developments in latent topic models have produced promising results for automatic sub-field discovery. 
The fine-grained, faceted topics produced by such models provide a clearer view of the topical divisions of a body of research literature and the interactions between those divisions. We demonstrate the usefulness of topic models in measuring impact by applying a new phrase-based topic discovery model to a collection of 300,000 computer science publications, collected by the Rexa automatic citation indexing system}, pages = {65--74}, author = {Mann, G.S and Mimno, D and McCallum, A and {2006 IEEE/ACM 6th Joint Conference on Digital Libraries}}, date = {2006}, note = {00083}, file = {Mann et al. - 2006 - Bibliometric impact measures leveraging topic anal.pdf:/home/jeremy/Zotero/storage/RHR8REID/Mann et al. - 2006 - Bibliometric impact measures leveraging topic anal.pdf:application/pdf} } @article{reid_mapping_2007, title = {Mapping the contemporary terrorism research domain}, volume = {65}, issn = {1071-5819}, abstract = {A systematic view of terrorism research to reveal the intellectual structure of the field and empirically discern the distinct set of core researchers, institutional affiliations, publications, and conceptual areas can help us gain a deeper understanding of approaches to terrorism. This paper responds to this need by using an integrated knowledge-mapping framework that we developed to identify the core researchers and knowledge creation approaches in terrorism. The framework uses three types of analysis: (a) basic analysis of scientific output using citation, bibliometric, and social network analyses, (b) content map analysis of large corpora of literature, and (c) co-citation analysis to analyse linkages among pairs of researchers. We applied domain visualization techniques such as content map analysis, block-modeling, and co-citation analysis to the literature and author citation data from the years 1965 to 2003. The data were gathered from ten databases such as the {ISI} Web of Science. 
The results reveal: (1) the names of the top 42 core terrorism researchers (e.g., Brian Jenkins, Bruce Hoffman, and Paul Wilkinson) as well as their institutional affiliations; (2) their influential publications; (3) clusters of terrorism researchers who work in similar areas; and (4) that the research focus has shifted from terrorism as a low-intensity conflict to a strategic threat to world powers with increased focus on Osama Bin Laden.}, pages = {42--56}, number = {1}, journaltitle = {{YIJHC} International Journal of Human - Computer Studies}, author = {Reid, Edna F and Chen, Hsinchun}, date = {2007}, note = {00091}, file = {Reid and Chen - 2007 - Mapping the contemporary terrorism research domain.pdf:/home/jeremy/Zotero/storage/DAN5ATFN/Reid and Chen - 2007 - Mapping the contemporary terrorism research domain.pdf:application/pdf} } @article{blei_probabilistic_2012, title = {Probabilistic Topic Models}, volume = {55}, issn = {0001-0782}, url = {http://doi.acm.org/10.1145/2133806.2133826}, doi = {10.1145/2133806.2133826}, abstract = {Surveying a suite of algorithms that offer a solution to managing large document archives.}, pages = {77--84}, number = {4}, journaltitle = {Commun. 
{ACM}}, author = {Blei, David M.}, urldate = {2016-03-07}, date = {2012-04}, file = {Blei - 2012 - Probabilistic Topic Models.pdf:/home/jeremy/Zotero/storage/5HZENWNZ/Blei - 2012 - Probabilistic Topic Models.pdf:application/pdf} } @article{schwartz_personality_2013, title = {Personality, Gender, and Age in the Language of Social Media: The Open-Vocabulary Approach}, volume = {8}, issn = {1932-6203}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0073791}, doi = {10.1371/journal.pone.0073791}, shorttitle = {Personality, Gender, and Age in the Language of Social Media}, abstract = {We analyzed 700 million words, phrases, and topic instances collected from the Facebook messages of 75,000 volunteers, who also took standard personality tests, and found striking variations in language with personality, gender, and age. In our open-vocabulary technique, the data itself drives a comprehensive exploration of language that distinguishes people, finding connections that are not captured with traditional closed-vocabulary word-category analyses. Our analyses shed new light on psychosocial processes yielding results that are face valid (e.g., subjects living in high elevations talk about the mountains), tie in with other research (e.g., neurotic people disproportionately use the phrase ‘sick of’ and the word ‘depressed’), suggest new hypotheses (e.g., an active life implies emotional stability), and give detailed insights (males use the possessive ‘my’ when mentioning their ‘wife’ or ‘girlfriend’ more often than females use ‘my’ with ‘husband’ or 'boyfriend’). To date, this represents the largest study, by an order of magnitude, of language and personality.}, pages = {e73791}, number = {9}, journaltitle = {{PLOS} {ONE}}, shortjournal = {{PLOS} {ONE}}, author = {Schwartz, H. Andrew and Eichstaedt, Johannes C. and Kern, Margaret L. and Dziurzynski, Lukasz and Ramones, Stephanie M. 
and Agrawal, Megha and Shah, Achal and Kosinski, Michal and Stillwell, David and Seligman, Martin E. P. and Ungar, Lyle H.}, urldate = {2016-03-07}, date = {2013-09-25}, keywords = {Social Media, Facebook, Personality, Psychology, language, Psycholinguistics, Forecasting, Vocabulary}, file = {Schwartz et al. - 2013 - Personality, Gender, and Age in the Language of So.pdf:/home/jeremy/Zotero/storage/CKR7EZ5S/Schwartz et al. - 2013 - Personality, Gender, and Age in the Language of So.pdf:application/pdf} } @article{kovacs_exploring_2015, title = {Exploring the scope of open innovation: a bibliometric review of a decade of research}, volume = {104}, issn = {0138-9130, 1588-2861}, url = {http://link.springer.com/article/10.1007/s11192-015-1628-0}, doi = {10.1007/s11192-015-1628-0}, shorttitle = {Exploring the scope of open innovation}, abstract = {The concept of open innovation has attracted considerable attention since Henry Chesbrough first coined it to capture the increasing reliance of firms on external sources of innovation. Although open innovation has flourished as a topic within innovation management research, it has also triggered debates about the coherence of the research endeavors pursued under this umbrella, including its theoretical foundations. In this paper, we aim to contribute to these debates through a bibliometric review of the first decade of open innovation research. We combine two techniques—bibliographic coupling and co-citation analysis—to (1) visualize the network of publications that explicitly use the label ‘open innovation’ and (2) to arrive at distinct clusters of thematically related publications. Our findings illustrate that open innovation research builds principally on four related streams of prior research, whilst the bibliographic network of open innovation research reveals that seven thematic clusters have been pursued persistently. 
While such persistence is undoubtedly useful to arrive at in-depth and robust insights, the observed patterns also signal the absence of new, emerging, themes. As such, ‘open innovation’ might benefit from applying its own ideas: sourcing concepts and models from a broader range of theoretical perspectives as well as pursuing a broader range of topics might introduce dynamics resulting in more impact and proliferation.}, pages = {951--983}, number = {3}, journaltitle = {Scientometrics}, shortjournal = {Scientometrics}, author = {Kovács, Adrián and Looy, Bart Van and Cassiman, Bruno}, urldate = {2016-04-20}, date = {2015-06-20}, langid = {english}, keywords = {open innovation, Library Science, Information Storage and Retrieval, 91-02, Co-citation analysis, Bibliographic coupling, O32, Q55, Interdisciplinary Studies, openness}, file = {Kovács et al. - 2015 - Exploring the scope of open innovation a bibliome.pdf:/home/jeremy/Zotero/storage/MFDEMAFC/Kovács et al. - 2015 - Exploring the scope of open innovation a bibliome.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/AITBH9EK/s11192-015-1628-0.html:text/html} } @inproceedings{blei_dynamic_2006, title = {Dynamic topic models}, url = {http://dl.acm.org/citation.cfm?id=1143859}, pages = {113--120}, booktitle = {Proceedings of the 23rd international conference on Machine learning}, publisher = {{ACM}}, author = {Blei, David M. and Lafferty, John D.}, urldate = {2016-04-21}, date = {2006}, file = {[PDF] from cmu.edu:/home/jeremy/Zotero/storage/UBSD9KNT/Blei and Lafferty - 2006 - Dynamic topic models.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/MR3H4FSU/citation.html:text/html} } @inproceedings{hall_studying_2008, location = {Stroudsburg, {PA}, {USA}}, title = {Studying the History of Ideas Using Topic Models}, url = {http://dl.acm.org/citation.cfm?id=1613715.1613763}, series = {{EMNLP} '08}, abstract = {How can the development of ideas in a scientific field be studied over time? 
We apply unsupervised topic modeling to the {ACL} Anthology to analyze historical trends in the field of Computational Linguistics from 1978 to 2006. We induce topic clusters using Latent Dirichlet Allocation, and examine the strength of each topic over time. Our methods find trends in the field including the rise of probabilistic methods starting in 1988, a steady increase in applications, and a sharp decline of research in semantics and understanding between 1978 and 2001, possibly rising again after 2001. We also introduce a model of the diversity of ideas, topic entropy, using it to show that {COLING} is a more diverse conference than {ACL}, but that both conferences as well as {EMNLP} are becoming broader over time. Finally, we apply Jensen-Shannon divergence of topic distributions to show that all three conferences are converging in the topics they cover.}, pages = {363--371}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing}, publisher = {Association for Computational Linguistics}, author = {Hall, David and Jurafsky, Daniel and Manning, Christopher D.}, urldate = {2016-04-21}, date = {2008}, file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/UZV4H35G/Hall et al. - 2008 - Studying the History of Ideas Using Topic Models.pdf:application/pdf} } @inproceedings{mitra_language_2014, location = {New York, {NY}, {USA}}, title = {The Language That Gets People to Give: Phrases That Predict Success on Kickstarter}, isbn = {978-1-4503-2540-0}, url = {http://doi.acm.org/10.1145/2531602.2531656}, doi = {10.1145/2531602.2531656}, series = {{CSCW} '14}, shorttitle = {The Language That Gets People to Give}, abstract = {Crowdfunding sites like Kickstarter--where entrepreneurs and artists look to the internet for funding--have quickly risen to prominence. However, we know very little about the factors driving the 'crowd' to take projects to their funding goal. 
In this paper we explore the factors which lead to successfully funding a crowdfunding project. We study a corpus of 45K crowdfunded projects, analyzing 9M phrases and 59 other variables commonly present on crowdfunding sites. The language used in the project has surprising predictive power accounting for 58.56\% of the variance around successful funding. A closer look at the phrases shows they exhibit general persuasion principles. For example, also receive two reflects the principle of Reciprocity and is one of the top predictors of successful funding. We conclude this paper by announcing the release of the predictive phrases along with the control variables as a public dataset, hoping that our work can enable new features on crowdfunding sites--tools to help both backers and project creators make the best use of their time and money.}, pages = {49--61}, booktitle = {Proceedings of the 17th {ACM} Conference on Computer Supported Cooperative Work \& Social Computing}, publisher = {{ACM}}, author = {Mitra, Tanushree and Gilbert, Eric}, urldate = {2016-04-29}, date = {2014}, keywords = {crowdfunding, natural language processing (nlp), {CMC}} } @book{wasserman_social_1994, title = {Social Network Analysis: Methods And Applications}, publisher = {Cambridge University Press}, author = {Wasserman, Stanley and Faust, Katherine}, date = {1994} } @article{tausczik_psychological_2010, title = {The Psychological Meaning of Words: {LIWC} and Computerized Text Analysis Methods}, volume = {29}, issn = {0261-927X, 1552-6526}, url = {http://jls.sagepub.com/content/29/1/24}, doi = {10.1177/0261927X09351676}, shorttitle = {The Psychological Meaning of Words}, abstract = {We are in the midst of a technological revolution whereby, for the first time, researchers can link daily word use to a broad array of real-world behaviors. This article reviews several computerized text analysis methods and describes how Linguistic Inquiry and Word Count ({LIWC}) was created and validated. 
{LIWC} is a transparent text analysis program that counts words in psychologically meaningful categories. Empirical results using {LIWC} demonstrate its ability to detect meaning in a wide variety of experimental settings, including to show attentional focus, emotionality, social relationships, thinking styles, and individual differences.}, pages = {24--54}, number = {1}, journaltitle = {Journal of Language and Social Psychology}, shortjournal = {Journal of Language and Social Psychology}, author = {Tausczik, Yla R. and Pennebaker, James W.}, urldate = {2016-07-12}, date = {2010-03-01}, langid = {english}, keywords = {attention, {LIWC}, deception, dominance, relationships, pronouns, computerized text analysis}, file = {Full Text PDF:/home/jeremy/Zotero/storage/G6TIZD38/Tausczik and Pennebaker - 2010 - The Psychological Meaning of Words LIWC and Compu.pdf:application/pdf} } @book{smith_general_2014, title = {General social surveys, 1972-2014}, shorttitle = {General social surveys, 1972-2014}, publisher = {National Opinion Research Center ({NORC})}, author = {Smith, Tom William and Marsden, Peter and Hout, Michael and Kim, Jibum}, date = {2014} } @book{leskovec_snap_2014, title = {{SNAP} Datasets: Stanford Large Network Dataset Collection}, url = {http://snap.stanford.edu/data}, author = {Leskovec, Jure and Krevl, Andrej}, date = {2014-06} } @article{kozinets_field_2002, title = {The Field Behind the Screen: Using Netnography for Marketing Research in Online Communities}, volume = {39}, issn = {0022-2437}, url = {http://journals.ama.org/doi/abs/10.1509/jmkr.39.1.61.18935}, doi = {10.1509/jmkr.39.1.61.18935}, shorttitle = {The Field Behind the Screen}, abstract = {The author develops “netnography” as an online marketing research technique for providing consumer insight. Netnography is ethnography adapted to the study of online communities. 
As a method, netnography is faster, simpler, and less expensive than traditional ethnography and more naturalistic and unobtrusive than focus groups or interviews. It provides information on the symbolism, meanings, and consumption patterns of online consumer groups. The author provides guidelines that acknowledge the online environment, respect the inherent flexibility and openness of ethnography, and provide rigor and ethics in the conduct of marketing research. As an illustrative example, the author provides a netnography of an online coffee newsgroup and discusses its marketing implications.}, pages = {61--72}, number = {1}, journaltitle = {Journal of Marketing Research}, shortjournal = {Journal of Marketing Research}, author = {Kozinets, Robert V.}, urldate = {2016-07-18}, date = {2002-02-01} } @article{chew_pandemics_2010, title = {Pandemics in the Age of Twitter: Content Analysis of Tweets during the 2009 H1N1 Outbreak}, volume = {5}, issn = {1932-6203}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0014118}, doi = {10.1371/journal.pone.0014118}, shorttitle = {Pandemics in the Age of Twitter}, abstract = {Background Surveys are popular methods to measure public perceptions in emergencies but can be costly and time consuming. We suggest and evaluate a complementary “infoveillance” approach using Twitter during the 2009 H1N1 pandemic. Our study aimed to: 1) monitor the use of the terms “H1N1” versus “swine flu” over time; 2) conduct a content analysis of “tweets”; and 3) validate Twitter as a real-time content, sentiment, and public attention trend-tracking tool. Methodology/Principal Findings Between May 1 and December 31, 2009, we archived over 2 million Twitter posts containing keywords “swine flu,” “swineflu,” and/or “H1N1.” using Infovigil, an infoveillance system. Tweets using “H1N1” increased from 8.8\% to 40.5\% ( R 2  = .788; p \<.001), indicating a gradual adoption of World Health Organization-recommended terminology. 
5,395 tweets were randomly selected from 9 days, 4 weeks apart and coded using a tri-axial coding scheme. To track tweet content and to test the feasibility of automated coding, we created database queries for keywords and correlated these results with manual coding. Content analysis indicated resource-related posts were most commonly shared (52.6\%). 4.5\% of cases were identified as misinformation. News websites were the most popular sources (23.2\%), while government and health agencies were linked only 1.5\% of the time. 7/10 automated queries correlated with manual coding. Several Twitter activity peaks coincided with major news stories. Our results correlated well with H1N1 incidence data. Conclusions This study illustrates the potential of using social media to conduct “infodemiology” studies for public health. 2009 H1N1-related tweets were primarily used to disseminate information from credible sources, but were also a source of opinions and experiences. Tweets can be used for real-time content analysis and knowledge translation research, allowing health authorities to respond to public concerns.}, pages = {e14118}, number = {11}, journaltitle = {{PLOS} {ONE}}, shortjournal = {{PLOS} {ONE}}, author = {Chew, Cynthia and Eysenbach, Gunther}, urldate = {2016-07-18}, date = {2010-11-29}, keywords = {Chi square tests, Public and occupational health, Data Mining, H1N1, Swine influenza, twitter, Swine, Internet}, file = {Full Text PDF:/home/jeremy/Zotero/storage/KV2JGXGC/Chew and Eysenbach - 2010 - Pandemics in the Age of Twitter Content Analysis .pdf:application/pdf} } @inproceedings{agichtein_finding_2008, location = {New York, {NY}, {USA}}, title = {Finding High-quality Content in Social Media}, isbn = {978-1-59593-927-2}, url = {http://doi.acm.org/10.1145/1341531.1341557}, doi = {10.1145/1341531.1341557}, series = {{WSDM} '08}, abstract = {The quality of user-generated content varies drastically from excellent to abuse and spam. 
As the availability of such content increases, the task of identifying high-quality content sites based on user contributions --social media sites -- becomes increasingly important. Social media in general exhibit a rich variety of information sources: in addition to the content itself, there is a wide array of non-content information available, such as links between items and explicit quality ratings from members of the community. In this paper we investigate methods for exploiting such community feedback to automatically identify high quality content. As a test case, we focus on Yahoo! Answers, a large community question/answering portal that is particularly rich in the amount and types of content and social interactions available in it. We introduce a general classification framework for combining the evidence from different sources of information, that can be tuned automatically for a given social media type and quality definition. In particular, for the community question/answering domain, we show that our system is able to separate high-quality items from the rest with an accuracy close to that of humans}, pages = {183--194}, booktitle = {Proceedings of the 2008 International Conference on Web Search and Data Mining}, publisher = {{ACM}}, author = {Agichtein, Eugene and Castillo, Carlos and Donato, Debora and Gionis, Aristides and Mishne, Gilad}, urldate = {2016-07-19}, date = {2008}, keywords = {media, user interactions, community question answering}, file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/CNFWMINP/Agichtein et al. - 2008 - Finding High-quality Content in Social Media.pdf:application/pdf;ACM Full Text PDF:/home/jeremy/Zotero/storage/9BDZK58M/Agichtein et al. 
- 2008 - Finding High-quality Content in Social Media.pdf:application/pdf} } @inproceedings{resnick_grouplens:_1994, location = {New York, {NY}, {USA}}, title = {{GroupLens}: An Open Architecture for Collaborative Filtering of Netnews}, isbn = {978-0-89791-689-9}, url = {http://doi.acm.org/10.1145/192844.192905}, doi = {10.1145/192844.192905}, series = {{CSCW} '94}, shorttitle = {{GroupLens}}, abstract = {Collaborative filters help people make choices based on the opinions of other people. {GroupLens} is a system for collaborative filtering of netnews, to help people find articles they will like in the huge stream of available articles. News reader clients display predicted scores and make it easy for users to rate articles after they read them. Rating servers, called Better Bit Bureaus, gather and disseminate the ratings. The rating servers predict scores based on the heuristic that people who agreed in the past will probably agree again. Users can protect their privacy by entering ratings under pseudonyms, without reducing the effectiveness of the score prediction. The entire architecture is open: alternative software for news clients and Better Bit Bureaus can be developed independently and can interoperate with the components we have developed.}, pages = {175--186}, booktitle = {Proceedings of the 1994 {ACM} Conference on Computer Supported Cooperative Work}, publisher = {{ACM}}, author = {Resnick, Paul and Iacovou, Neophytos and Suchak, Mitesh and Bergstrom, Peter and Riedl, John}, urldate = {2016-07-19}, date = {1994}, keywords = {collaborative filtering, selective dissemination of information, user model, social filtering, electronic bulletin boards, netnews, information filtering, Usenet}, file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/JPUR4MA4/Resnick et al. 
- 1994 - GroupLens An Open Architecture for Collaborative .pdf:application/pdf} } @inproceedings{wang_tm-lda:_2012, title = {{TM}-{LDA}: efficient online modeling of latent topic transitions in social media}, isbn = {978-1-4503-1462-6}, url = {http://dl.acm.org/citation.cfm?doid=2339530.2339552}, doi = {10.1145/2339530.2339552}, shorttitle = {{TM}-{LDA}}, pages = {123}, publisher = {{ACM} Press}, author = {Wang, Yu and Agichtein, Eugene and Benzi, Michele}, urldate = {2016-07-19}, date = {2012}, langid = {english} } @inproceedings{prier_identifying_2011, location = {Berlin, Heidelberg}, title = {Identifying Health-related Topics on Twitter: An Exploration of Tobacco-related Tweets As a Test Topic}, isbn = {978-3-642-19655-3}, url = {http://dl.acm.org/citation.cfm?id=1964698.1964702}, series = {{SBP}'11}, shorttitle = {Identifying Health-related Topics on Twitter}, abstract = {Public health-related topics are difficult to identify in large conversational datasets like Twitter. This study examines how to model and discover public health topics and themes in tweets. Tobacco use is chosen as a test case to demonstrate the effectiveness of topic modeling via {LDA} across a large, representational dataset from the United States, as well as across a smaller subset that was seeded by tobacco-related queries. Topic modeling across the large dataset uncovers several public health-related topics, although tobacco is not detected by this method. However, topic modeling across the tobacco subset provides valuable insight about tobacco use in the United States. The methods used in this paper provide a possible toolset for public health researchers and practitioners to better understand public health problems through large datasets of conversational data.}, pages = {18--25}, booktitle = {Proceedings of the 4th International Conference on Social Computing, Behavioral-cultural Modeling and Prediction}, publisher = {Springer-Verlag}, author = {Prier, Kyle W. and Smith, Matthew S. 
and Giraud-Carrier, Christophe and Hanson, Carl L.}, urldate = {2016-07-19}, date = {2011}, keywords = {Social Media, tobacco use, {LDA}, Data Mining, topic modeling, Social networks, public health} } @inproceedings{pennacchiotti_investigating_2011, location = {New York, {NY}, {USA}}, title = {Investigating Topic Models for Social Media User Recommendation}, isbn = {978-1-4503-0637-9}, url = {http://doi.acm.org/10.1145/1963192.1963244}, doi = {10.1145/1963192.1963244}, series = {{WWW} '11}, abstract = {This paper presents a user recommendation system that recommends to a user new friends having similar interests. We automatically discover users' interests using Latent Dirichlet Allocation ({LDA}), a linguistic topic model that represents users as mixtures of topics. Our system is able to recommend friends for 4 million users with high recall, outperforming existing strategies based on graph analysis.}, pages = {101--102}, booktitle = {Proceedings of the 20th International Conference Companion on World Wide Web}, publisher = {{ACM}}, author = {Pennacchiotti, Marco and Gurumurthy, Siva}, urldate = {2016-07-19}, date = {2011}, keywords = {Social Media, {LDA}, user recommendation, Topic models}, file = {ACM Full Text PDF:/home/jeremy/Zotero/storage/R389CKQJ/Pennacchiotti and Gurumurthy - 2011 - Investigating Topic Models for Social Media User R.pdf:application/pdf} } @article{yang_identifying_2014, title = {Identifying Interesting Twitter Contents Using Topical Analysis}, volume = {41}, issn = {0957-4174}, url = {http://dx.doi.org/10.1016/j.eswa.2013.12.051}, doi = {10.1016/j.eswa.2013.12.051}, abstract = {Social media platforms such as Twitter are becoming increasingly mainstream which provides valuable user-generated information by publishing and sharing contents. Identifying interesting and useful contents from large text-streams is a crucial issue in social media because many users struggle with information overload. 
Retweeting as a forwarding function plays an important role in information propagation where the retweet counts simply reflect a tweet's popularity. However, the main reason for retweets may be limited to personal interests and satisfactions. In this paper, we use a topic identification as a proxy to understand a large number of tweets and to score the interestingness of an individual tweet based on its latent topics. Our assumption is that fascinating topics generate contents that may be of potential interest to a wide audience. We propose a novel topic model called Trend Sensitive-Latent Dirichlet Allocation ({TS}-{LDA}) that can efficiently extract latent topics from contents by modeling temporal trends on Twitter over time. The experimental results on real world data from Twitter demonstrate that our proposed method outperforms several other baseline methods.},
  pages = {4330--4336},
  number = {9},
  journaltitle = {Expert Systems with Applications},
  shortjournal = {Expert Syst. Appl.},
  author = {Yang, Min-Chul and Rim, Hae-Chang},
  urldate = {2016-07-19},
  date = {2014-07},
  keywords = {Social Media, Interesting content, {LDA}, Topic model, twitter}
}

@article{fruchterman_graph_1991,
  title = {Graph drawing by force-directed placement},
  volume = {21},
  rights = {Copyright © 1991 John Wiley \& Sons, Ltd},
  issn = {1097-024X},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/spe.4380211102/abstract},
  doi = {10.1002/spe.4380211102},
  abstract = {We present a modification of the spring-embedder model of Eades [Congressus Numerantium, 42, 149–160, (1984)] for drawing undirected graphs with straight edges. Our heuristic strives for uniform edge lengths, and we develop it in analogy to forces in natural systems, for a simple, elegant, conceptually-intuitive, and efficient algorithm.},
  pages = {1129--1164},
  number = {11},
  journaltitle = {Software: Practice and Experience},
  shortjournal = {Softw: Pract. Exper.},
  author = {Fruchterman, Thomas M. J.
and Reingold, Edward M.},
  urldate = {2016-07-20},
  date = {1991-11-01},
  langid = {english},
  keywords = {Multi-level techniques, Force-directed placement, Graph drawing, Simulated annealing},
  file = {Snapshot:/home/jeremy/Zotero/storage/SR6JA3QW/abstract.html:text/html}
}

@article{bastian_gephi:_2009,
  title = {Gephi: an open source software for exploring and manipulating networks},
  volume = {8},
  url = {http://www.aaai.org/ocs/index.php/ICWSM/09/paper/viewFile/154/1009/},
  shorttitle = {Gephi},
  pages = {361--362},
  journaltitle = {{ICWSM}},
  author = {Bastian, Mathieu and Heymann, Sebastien and Jacomy, Mathieu and others},
  urldate = {2016-07-20},
  date = {2009},
  file = {Bastian et al. - 2009 - Gephi an open source software for exploring and m.pdf:/home/jeremy/Zotero/storage/Q82CV3RM/Bastian et al. - 2009 - Gephi an open source software for exploring and m.pdf:application/pdf}
}

@unpublished{binfield_plos_2012,
  location = {National Institute for Informatics},
  title = {{PLoS} {ONE} and the rise of the Open Access {MegaJournal}},
  url = {http://www.nii.ac.jp/sparc/en/event/2011/pdf/20120229_doc3_binfield.pdf},
  note = {The 5th {SPARC} Japan Seminar 2011},
  author = {Binfield, Peter},
  urldate = {2016-07-20},
  date = {2012-02-29},
  file = {[PDF] from nii.ac.jp:/home/jeremy/Zotero/storage/DU86MXEM/Binfield - 2003 - PLoS ONE and the rise of the Open Access MegaJourn.pdf:application/pdf}
}

@article{subelj_clustering_2016,
  title = {Clustering Scientific Publications Based on Citation Relations: A Systematic Comparison of Different Methods},
  volume = {11},
  issn = {1932-6203},
  url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0154404},
  doi = {10.1371/journal.pone.0154404},
  shorttitle = {Clustering Scientific Publications Based on Citation Relations},
  abstract = {Clustering methods are applied regularly in the bibliometric literature to identify research areas or scientific fields.
These methods are for instance used to group publications into clusters based on their relations in a citation network. In the network science literature, many clustering methods, often referred to as graph partitioning or community detection techniques, have been developed. Focusing on the problem of clustering the publications in a citation network, we present a systematic comparison of the performance of a large number of these clustering methods. Using a number of different citation networks, some of them relatively small and others very large, we extensively study the statistical properties of the results provided by different methods. In addition, we also carry out an expert-based assessment of the results produced by different methods. The expert-based assessment focuses on publications in the field of scientometrics. Our findings seem to indicate that there is a trade-off between different properties that may be considered desirable for a good clustering of publications. Overall, map equation methods appear to perform best in our analysis, suggesting that these methods deserve more attention from the bibliometric community.}, pages = {e0154404}, number = {4}, journaltitle = {{PLOS} {ONE}}, shortjournal = {{PLOS} {ONE}}, author = {Šubelj, Lovro and Eck, Nees Jan van and Waltman, Ludo}, urldate = {2016-07-20}, date = {2016-04-28}, keywords = {Library Science, Bibliometrics, Graphs, Algorithms, Statistical methods, Optimization, Computer and information sciences, Scientometrics}, file = {Full Text PDF:/home/jeremy/Zotero/storage/UQJHZF6X/Šubelj et al. 
- 2016 - Clustering Scientific Publications Based on Citati.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/7T77BK72/article.html:text/html} } @article{small_co-citation_1973, title = {Co-citation in the scientific literature: A new measure of the relationship between two documents}, volume = {24}, rights = {Copyright © 1973 Wiley Periodicals, Inc., A Wiley Company}, issn = {1097-4571}, url = {http://onlinelibrary.wiley.com/doi/10.1002/asi.4630240406/abstract}, doi = {10.1002/asi.4630240406}, shorttitle = {Co-citation in the scientific literature}, abstract = {A new form of document coupling called co-citation is defined as the frequency with which two documents are cited together. The co-citation frequency of two scientific papers can be determined by comparing lists of citing documents in the Science Citation Index and counting identical entries. Networks of co-cited papers can be generated for specific scientific specialties, and an example is drawn from the literature of particle physics. Co-citation patterns are found to differ significantly from bibliographic coupling patterns, but to agree generally with patterns of direct citation. Clusters of co-cited papers provide a new way to study the specialty structure of science. They may provide a new approach to indexing and to the creation of {SDI} profiles.}, pages = {265--269}, number = {4}, journaltitle = {Journal of the American Society for Information Science}, shortjournal = {J. Am. Soc. Inf. 
Sci.}, author = {Small, Henry}, urldate = {2016-07-20}, date = {1973-07-01}, langid = {english}, file = {Full Text PDF:/home/jeremy/Zotero/storage/9HF57A4X/Small - 1973 - Co-citation in the scientific literature A new me.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/NF4S7SJ4/abstract.html:text/html} } @article{rosvall_map_2010, title = {The map equation}, volume = {178}, issn = {1951-6355, 1951-6401}, url = {http://link.springer.com/article/10.1140/epjst/e2010-01179-1}, doi = {10.1140/epjst/e2010-01179-1}, abstract = {Many real-world networks are so large that we must simplify their structure before we can extract useful information about the systems they represent. As the tools for doing these simplifications proliferate within the network literature, researchers would benefit from some guidelines about which of the so-called community detection algorithms are most appropriate for the structures they are studying and the questions they are asking. Here we show that different methods highlight different aspects of a network's structure and that the sort of information that we seek to extract about the system must guide us in our decision. For example, many community detection algorithms, including the popular modularity maximization approach, infer module assignments from an underlying model of the network formation process. However, we are not always as interested in how a system's network structure was formed, as we are in how a network's extant structure influences the system's behavior. To see how structure influences current behavior, we will recognize that links in a network induce movement across the network and result in system-wide interdependence. In doing so, we explicitly acknowledge that most networks carry flow. To highlight and simplify the network structure with respect to this flow, we use the map equation.
We present an intuitive derivation of this flow-based and information-theoretic method and provide an interactive on-line application that anyone can use to explore the mechanics of the map equation. The differences between the map equation and the modularity maximization approach are not merely conceptual. Because the map equation attends to patterns of flow on the network and the modularity maximization approach does not, the two methods can yield dramatically different results for some network structures. To illustrate this and build our understanding of each method, we partition several sample networks. We also describe an algorithm and provide source code to efficiently decompose large weighted and directed networks based on the map equation.}, pages = {13--23}, number = {1}, journaltitle = {The European Physical Journal Special Topics}, shortjournal = {Eur. Phys. J. Spec. Top.}, author = {Rosvall, M. and Axelsson, D. and Bergstrom, C. T.}, urldate = {2016-07-20}, date = {2010-04-17}, langid = {english}, file = {Full Text PDF:/home/jeremy/Zotero/storage/SP7AM2FW/Rosvall et al. - 2010 - The map equation.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/36S24FS9/e2010-01179-1.html:text/html} } @article{rosvall_maps_2008, title = {Maps of random walks on complex networks reveal community structure}, volume = {105}, issn = {0027-8424, 1091-6490}, url = {http://www.pnas.org/content/105/4/1118}, doi = {10.1073/pnas.0706851105}, abstract = {To comprehend the multipartite organization of large-scale biological and social systems, we introduce an information theoretic approach that reveals community structure in weighted and directed networks. We use the probability flow of random walks on a network as a proxy for information flows in the real system and decompose the network into modules by compressing a description of the probability flow. The result is a map that both simplifies and highlights the regularities in the structure and their relationships. 
We illustrate the method by making a map of scientific communication as captured in the citation patterns of {\textgreater}6,000 journals. We discover a multicentric organization with fields that vary dramatically in size and degree of integration into the network of science. Along the backbone of the network—including physics, chemistry, molecular biology, and medicine—information flows bidirectionally, but the map reveals a directional pattern of citation from the applied fields to the basic sciences.},
  pages = {1118--1123},
  number = {4},
  journaltitle = {Proceedings of the National Academy of Sciences},
  shortjournal = {{PNAS}},
  author = {Rosvall, Martin and Bergstrom, Carl T.},
  urldate = {2016-07-20},
  date = {2008-01-29},
  langid = {english},
  pmid = {18216267},
  keywords = {compression, clustering, information theory, map of science, bibliometrics},
  file = {Full Text PDF:/home/jeremy/Zotero/storage/3HQG7TS3/Rosvall and Bergstrom - 2008 - Maps of random walks on complex networks reveal co.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/TG6S96XS/1118.html:text/html}
}

@article{ghosh_what_2013,
  title = {What are we `tweeting' about obesity? Mapping tweets with topic modeling and Geographic Information System},
  volume = {40},
  issn = {1523-0406},
  url = {http://dx.doi.org/10.1080/15230406.2013.776210},
  doi = {10.1080/15230406.2013.776210},
  shorttitle = {What are we `tweeting' about obesity?},
  abstract = {Public health related tweets are difficult to identify in large conversational datasets like Twitter.com. Even more challenging is the visualization and analyses of the spatial patterns encoded in tweets. This study has the following objectives: how can topic modeling be used to identify relevant public health topics such as obesity on Twitter.com? What are the common obesity related themes? What is the spatial pattern of the themes? What are the research challenges of using large conversational datasets from social networking sites?
Obesity is chosen as a test theme to demonstrate the effectiveness of topic modeling using Latent Dirichlet Allocation ({LDA}) and spatial analysis using Geographic Information System ({GIS}). The dataset is constructed from tweets (originating from the United States) extracted from Twitter.com on obesity-related queries. Examples of such queries are ‘food deserts’, ‘fast food’, and ‘childhood obesity’. The tweets are also georeferenced and time stamped. Three cohesive and meaningful themes such as ‘childhood obesity and schools’, ‘obesity prevention’, and ‘obesity and food habits’ are extracted from the {LDA} model. The {GIS} analysis of the extracted themes show distinct spatial pattern between rural and urban areas, northern and southern states, and between coasts and inland states. Further, relating the themes with ancillary datasets such as {US} census and locations of fast food restaurants based upon the location of the tweets in a {GIS} environment opened new avenues for spatial analyses and mapping. Therefore the techniques used in this study provide a possible toolset for computational social scientists in general, and health researchers in specific, to better understand health problems from large conversational datasets.},
  pages = {90--102},
  number = {2},
  journaltitle = {Cartography and Geographic Information Science},
  author = {Ghosh, Debarchana and Guha, Rajarshi},
  urldate = {2016-07-19},
  date = {2013-03-01},
  file = {Full Text PDF:/home/jeremy/Zotero/storage/S3WJGXET/Ghosh and Guha - 2013 - What are we ‘tweeting’ about obesity Mapping twee.pdf:application/pdf}
}

@article{hidalgo_building_2009,
  title = {The building blocks of economic complexity},
  volume = {106},
  issn = {0027-8424, 1091-6490},
  url = {http://www.pnas.org/content/106/26/10570},
  doi = {10.1073/pnas.0900943106},
  abstract = {For Adam Smith, wealth was related to the division of labor.
As people and firms specialize in different activities, economic efficiency increases, suggesting that development is associated with an increase in the number of individual activities and with the complexity that emerges from the interactions between them. Here we develop a view of economic growth and development that gives a central role to the complexity of a country's economy by interpreting trade data as a bipartite network in which countries are connected to the products they export, and show that it is possible to quantify the complexity of a country's economy by characterizing the structure of this network. Furthermore, we show that the measures of complexity we derive are correlated with a country's level of income, and that deviations from this relationship are predictive of future growth. This suggests that countries tend to converge to the level of income dictated by the complexity of their productive structures, indicating that development efforts should focus on generating the conditions that would allow complexity to emerge to generate sustained growth and prosperity.}, pages = {10570--10575}, number = {26}, journaltitle = {Proceedings of the National Academy of Sciences}, shortjournal = {{PNAS}}, author = {Hidalgo, César A. and Hausmann, Ricardo}, urldate = {2016-07-20}, date = {2009-06-30}, langid = {english}, pmid = {19549871}, keywords = {networks, economic development}, file = {Full Text PDF:/home/jeremy/Zotero/storage/BSD98SD2/Hidalgo and Hausmann - 2009 - The building blocks of economic complexity.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/EXMG4VVB/10570.html:text/html} } @book{hausmann_atlas_2014, title = {The Atlas of Economic Complexity: Mapping Paths to Prosperity}, isbn = {978-0-262-31773-3}, shorttitle = {The Atlas of Economic Complexity}, abstract = {Why do some countries grow and others do not? 
The authors of The Atlas of Economic Complexity offer readers an explanation based on "Economic Complexity," a measure of a society's productive knowledge. Prosperous societies are those that have the knowledge to make a larger variety of more complex products. The Atlas of Economic Complexity attempts to measure the amount of productive knowledge countries hold and how they can move to accumulate more of it by making more complex products. Through the graphical representation of the "Product Space," the authors are able to identify each country's "adjacent possible," or potential new products, making it easier to find paths to economic diversification and growth. In addition, they argue that a country's economic complexity and its position in the product space are better predictors of economic growth than many other well-known development indicators, including measures of competitiveness, governance, finance, and schooling. Using innovative visualizations, the book locates each country in the product space, provides complexity and growth potential rankings for 128 countries, and offers individual country pages with detailed information about a country's current capabilities and its diversification options. The maps and visualizations included in the Atlas can be used to find more viable paths to greater productive knowledge and prosperity.}, pagetotal = {369}, publisher = {{MIT} Press}, author = {Hausmann, Ricardo and Hidalgo, César A.
and Bustos, Sebastián and Coscia, Michele and Simoes, Alexander and Yildirim, Muhammed A.},
  date = {2014-01-17},
  langid = {english},
  keywords = {Business \& Economics / International / Economics, Business \& Economics / Economics / Macroeconomics}
}

@article{hood_literature_2001,
  title = {The Literature of Bibliometrics, Scientometrics, and Informetrics},
  volume = {52},
  issn = {0138-9130},
  url = {http://link.springer.com/10.1023/A:1017919924342},
  doi = {10.1023/A:1017919924342},
  pages = {291--314},
  number = {2},
  journaltitle = {Scientometrics},
  author = {Hood, William W. and Wilson, Concepción S.},
  urldate = {2016-07-20},
  date = {2001}
}

@article{kessler_bibliographic_1963,
  title = {Bibliographic coupling between scientific papers},
  volume = {14},
  rights = {Copyright © 1963 Wiley Periodicals, Inc., A Wiley Company},
  issn = {1936-6108},
  url = {http://onlinelibrary.wiley.com/doi/10.1002/asi.5090140103/abstract},
  doi = {10.1002/asi.5090140103},
  abstract = {This report describes the results of automatic processing of a large number of scientific papers according to a rigorously defined criterion of coupling. The population of papers under study was ordered into groups that satisfy the stated criterion of interrelation. An examination of the papers that constitute the groups shows a high degree of logical correlation.},
  pages = {10--25},
  number = {1},
  journaltitle = {American Documentation},
  shortjournal = {Amer. Doc.},
  author = {Kessler, M.
M.}, urldate = {2016-04-20}, date = {1963-01-01}, langid = {english}, file = {Kessler - 1963 - Bibliographic coupling between scientific papers.pdf:/home/jeremy/Zotero/storage/SSZX4B3K/Kessler - 1963 - Bibliographic coupling between scientific papers.pdf:application/pdf} } @article{macy_factors_2002, title = {From Factors to Actors: Computational Sociology and Agent-Based Modeling}, volume = {28}, issn = {0360-0572}, url = {http://www.jstor.org/stable/3069238}, shorttitle = {From Factors to Actors}, abstract = {Sociologists often model social processes as interactions among variables. We review an alternative approach that models social life as interactions among adaptive agents who influence one another in response to the influence they receive. These agent-based models ({ABMs}) show how simple and predictable local interactions can generate familiar but enigmatic global patterns, such as the diffusion of information, emergence of norms, coordination of conventions, or participation in collective action. Emergent social patterns can also appear unexpectedly and then just as dramatically transform or disappear, as happens in revolutions, market crashes, fads, and feeding frenzies. {ABMs} provide theoretical leverage where the global patterns of interest are more than the aggregation of individual attributes, but at the same time, the emergent pattern cannot be understood without a bottom up dynamical model of the microfoundations at the relational level. We begin with a brief historical sketch of the shift from "factors" to "actors" in computational sociology that shows how agent-based modeling differs fundamentally from earlier sociological uses of computer simulation. We then review recent contributions focused on the emergence of social structure and social order out of local interaction. Although sociology has lagged behind other social sciences in appreciating this new methodology, a distinctive sociological contribution is evident in the papers we review. 
First, theoretical interest focuses on dynamic social networks that shape and are shaped by agent interaction. Second, {ABMs} are used to perform virtual experiments that test macrosociological theories by manipulating structural factors like network topology, social stratification, or spatial mobility. We conclude our review with a series of recommendations for realizing the rich sociological potential of this approach.},
  pages = {143--166},
  journaltitle = {Annual Review of Sociology},
  shortjournal = {Annual Review of Sociology},
  author = {Macy, Michael W. and Willer, Robert},
  urldate = {2016-07-20},
  date = {2002}
}

@book{neef_digital_2014,
  location = {Indianapolis, {IN}},
  edition = {1},
  title = {Digital Exhaust: What Everyone Should Know About Big Data, Digitization and Digitally Driven Innovation},
  isbn = {978-0-13-383796-4},
  shorttitle = {Digital Exhaust},
  abstract = {Will "Big Data" supercharge the economy, tyrannize us, or both? Data Exhaust is the definitive primer for everyone who wants to understand all the implications of Big Data, digitally driven innovation, and the accelerating Internet Economy.
Renowned digital expert Dale Neef clearly explains: What Big Data really is, and what's new and different about it How Big Data works, and what you need to know about Big Data technologies Where the data is coming from: how Big Data integrates sources ranging from social media to machine sensors, smartphones to financial transactions How companies use Big Data analytics to gain a more nuanced, accurate picture of their customers, their own performance, and the newest trends How governments and individual citizens can also benefit from Big Data How to overcome obstacles to success with Big Data – including poor data that can magnify human error A realistic assessment of Big Data threats to employment and personal privacy, now and in the future Neef places the Big Data phenomenon where it belongs: in the context of the broader global shift to the Internet economy, with all that implies. By doing so, he helps businesses plan Big Data strategy more effectively – and helps citizens and policymakers identify sensible policies for preventing its misuse.   By conservative estimate, the global Big Data market will soar past \$50 billion by 2018. But those direct expenses represent just the "tip of the iceberg" when it comes to Big Data's impact. Big Data is now of acute strategic interest for every organization that aims to succeed – and it is equally important to everyone else. Whoever you are, Data Exhaust tells you exactly what you need to know about Big Data – and what to do about it, too.}, pagetotal = {320}, publisher = {Pearson {FT} Press}, author = {Neef, Dale}, date = {2014-12-01} } @article{friedman_regularization_2010, title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, volume = {33}, issn = {1548-7660}, url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2929880/}, abstract = {We develop fast algorithms for estimation of generalized linear models with convex penalties. 
The models include linear regression, two-class logistic regression, and multinomial regression problems while the penalties include ℓ1 (the lasso), ℓ2 (ridge regression) and mixtures of the two (the elastic net). The algorithms use cyclical coordinate descent, computed along a regularization path. The methods can handle large problems and can also deal efficiently with sparse features. In comparative timings we find that the new algorithms are considerably faster than competing methods.},
  pages = {1--22},
  number = {1},
  journaltitle = {Journal of Statistical Software},
  shortjournal = {J Stat Softw},
  author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert},
  urldate = {2016-07-20},
  date = {2010},
  doi = {10.18637/jss.v033.i01},
  pmid = {20808728},
  pmcid = {PMC2929880}
}

@book{james_introduction_2013,
  location = {New York},
  title = {An introduction to statistical learning: with applications in R},
  isbn = {978-1-4614-7137-0},
  shorttitle = {An introduction to statistical learning},
  abstract = {"An Introduction to Statistical Learning provides an accessible overview of the field of statistical learning, an essential toolset for making sense of the vast and complex data sets that have emerged in fields ranging from biology to finance to marketing to astrophysics in the past twenty years. This book presents some of the most important modeling and prediction techniques, along with relevant applications. Topics include linear regression, classification, resampling methods, shrinkage approaches, tree-based methods, support vector machines, clustering, and more. Color graphics and real-world examples are used to illustrate the methods presented. Since the goal of this textbook is to facilitate the use of these statistical learning techniques by practitioners in science, industry, and other fields, each chapter contains a tutorial on implementing the analyses and methods presented in R, an extremely popular open source statistical software platform.
Two of the authors co-wrote The Elements of Statistical Learning (Hastie, Tibshirani and Friedman, 2nd edition 2009), a popular reference book for statistics and machine learning researchers. An Introduction to Statistical Learning covers many of the same topics, but at a level accessible to a much broader audience. This book is targeted at statisticians and non-statisticians alike who wish to use cutting-edge statistical learning techniques to analyze their data. The text assumes only a previous course in linear regression and no knowledge of matrix algebra. Provides tools for Statistical Learning that are essential for practitioners in science, industry and other fields. Analyses and methods are presented in R. Topics include linear regression, classification, resampling methods, shrinkage approaches, tree-based methods, support vector machines, and clustering. Extensive use of color graphics assist the reader"--Publisher description.}, publisher = {Springer}, author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert}, date = {2013} } @article{tibshirani_regression_1996, title = {Regression Shrinkage and Selection via the Lasso}, volume = {58}, issn = {0035-9246}, url = {http://www.jstor.org/stable/2346178}, abstract = {We propose a new method for estimation in linear models. The `lasso' minimizes the residual sum of squares subject to the sum of the absolute value of the coefficients being less than a constant. Because of the nature of this constraint it tends to produce some coefficients that are exactly 0 and hence gives interpretable models. Our simulation studies suggest that the lasso enjoys some of the favourable properties of both subset selection and ridge regression. It produces interpretable models like subset selection and exhibits the stability of ridge regression. There is also an interesting relationship with recent work in adaptive function estimation by Donoho and Johnstone. 
The lasso idea is quite general and can be applied in a variety of statistical models: extensions to generalized regression models and tree-based models are briefly described.}, pages = {267--288}, number = {1}, journaltitle = {Journal of the Royal Statistical Society. Series B (Methodological)}, shortjournal = {Journal of the Royal Statistical Society. Series B (Methodological)}, author = {Tibshirani, Robert}, urldate = {2016-07-20}, date = {1996} } @report{bollen_social_2015, title = {Social, Behavioral, and Economic Sciences Perspectives on Robust and Reliable Science}, url = {http://www.nsf.gov/sbe/AC_Materials/SBE_Robust_and_Reliable_Research_Report.pdf}, institution = {National Science Foundation}, author = {Bollen, Kenneth and Cacioppo, John T. and Kaplan, Robert M. and Krosnick, Jon A. and Olds, James L. and Dean, Heather}, date = {2015-05} } @article{stodden_toward_2013, title = {Toward Reproducible Computational Research: An Empirical Analysis of Data and Code Policy Adoption by Journals}, volume = {8}, issn = {1932-6203}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0067111}, doi = {10.1371/journal.pone.0067111}, shorttitle = {Toward Reproducible Computational Research}, abstract = {Journal policy on research data and code availability is an important part of the ongoing shift toward publishing reproducible computational science. This article extends the literature by studying journal data sharing policies by year (for both 2011 and 2012) for a referent set of 170 journals. We make a further contribution by evaluating code sharing policies, supplemental materials policies, and open access status for these 170 journals for each of 2011 and 2012. 
We build a predictive model of open data and code policy adoption as a function of impact factor and publisher and find higher impact journals more likely to have open data and code policies and scientific societies more likely to have open data and code policies than commercial publishers. We also find open data policies tend to lead open code policies, and we find no relationship between open data and code policies and either supplemental material policies or open access journal status. Of the journals in this study, 38\% had a data policy, 22\% had a code policy, and 66\% had a supplemental materials policy as of June 2012. This reflects a striking one year increase of 16\% in the number of data policies, a 30\% increase in code policies, and a 7\% increase in the number of supplemental materials policies. We introduce a new dataset to the community that categorizes data and code sharing, supplemental materials, and open access policies in 2011 and 2012 for these 170 journals.}, pages = {e67111}, number = {6}, journaltitle = {{PLOS} {ONE}}, shortjournal = {{PLOS} {ONE}}, author = {Stodden, Victoria and Guo, Peixuan and Ma, Zhaokun}, urldate = {2016-07-22}, date = {2013-06-21}, keywords = {Reproducibility, science policy, computational biology, Open access, Scientific publishing, open data, Computer and information sciences, Data management}, file = {Full Text PDF:/home/jeremy/Zotero/storage/PIC8KFJE/Stodden et al. - 2013 - Toward Reproducible Computational Research An Emp.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/NTS2JK5S/article.html:text/html} } @article{leveque_reproducible_2012, title = {Reproducible research for scientific computing: Tools and strategies for changing the culture}, volume = {14}, issn = {1521-9615}, shorttitle = {Reproducible research for scientific computing}, pages = {13--17}, number = {4}, journaltitle = {Computing in Science and Engineering}, author = {{LeVeque}, Randall J. and Mitchell, Ian M. 
and Stodden, Victoria},
  date = {2012},
  file = {LeVeque et al. - 2012 - Reproducible research for scientific computing To.pdf:/home/jeremy/Zotero/storage/2FHZTG9Q/LeVeque et al. - 2012 - Reproducible research for scientific computing To.pdf:application/pdf}
}

% Book on agent-based modeling with NetLogo.
@book{wilensky_introduction_2015,
  location = {Cambridge, Massachusetts},
  title = {An introduction to agent-based modeling: modeling natural, social, and engineered complex systems with {NetLogo}},
  shorttitle = {An introduction to agent-based modeling},
  publisher = {{MIT} Press},
  author = {Wilensky, Uri and Rand, William},
  urldate = {2016-07-19},
  date = {2015}
}

% Essay arguing for the study of small subsets of Big Data.
@article{welles_minorities_2014,
  title = {On minorities and outliers: The case for making Big Data small},
  volume = {1},
  issn = {2053-9517},
  url = {http://bds.sagepub.com/content/1/1/2053951714540613},
  doi = {10.1177/2053951714540613},
  shorttitle = {On minorities and outliers},
  abstract = {In this essay, I make the case for choosing to examine small subsets of Big Data datasets—making big data small. Big Data allows us to produce summaries of human behavior at a scale never before possible. But in the push to produce these summaries, we risk losing sight of a secondary but equally important advantage of Big Data—the plentiful representation of minorities. Women, minorities and statistical outliers have historically been omitted from the scientific record, with problematic consequences. Big Data affords the opportunity to remedy those omissions. However, to do so, Big Data researchers must choose to examine very small subsets of otherwise large datasets.
I encourage researchers to embrace an ethical, empirical and epistemological stance on Big Data that includes minorities and outliers as reference categories, rather than the exceptions to statistical norms.},
  pages = {2053951714540613},
  number = {1},
  journaltitle = {Big Data \& Society},
  author = {Welles, Brooke Foucault},
  urldate = {2016-07-23},
  date = {2014-04-01},
  langid = {english},
  file = {Full Text PDF:/home/jeremy/Zotero/storage/SS8P2JN4/Welles - 2014 - On minorities and outliers The case for making Bi.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/M2HTAVP2/2053951714540613.html:text/html}
}

% Book on analysing social media networks with NodeXL.
@book{hansen_analyzing_2010,
  location = {Burlington, Massachusetts},
  title = {Analyzing social media networks with {NodeXL}: Insights from a connected world},
  shorttitle = {Analyzing social media networks with {NodeXL}},
  publisher = {Morgan Kaufmann},
  author = {Hansen, Derek and Shneiderman, Ben and Smith, Marc A.},
  urldate = {2016-07-18},
  date = {2010}
}

% Conference paper on forecasting real-world outcomes from social media content.
@inproceedings{asur_predicting_2010,
  title = {Predicting the Future with Social Media},
  volume = {1},
  doi = {10.1109/WI-IAT.2010.63},
  abstract = {In recent years, social media has become ubiquitous and important for social networking and content sharing. And yet, the content that is generated from these websites remains largely untapped. In this paper, we demonstrate how social media content can be used to predict real-world outcomes. In particular, we use the chatter from Twitter.com to forecast box-office revenues for movies. We show that a simple model built from the rate at which tweets are created about particular topics can outperform market-based predictors.
We further demonstrate how sentiments extracted from Twitter can be utilized to improve the forecasting power of social media.},
  eventtitle = {2010 {IEEE}/{WIC}/{ACM} International Conference on Web Intelligence and Intelligent Agent Technology ({WI}-{IAT})},
  pages = {492--499},
  booktitle = {2010 {IEEE}/{WIC}/{ACM} International Conference on Web Intelligence and Intelligent Agent Technology ({WI}-{IAT})},
  author = {Asur, Sitaram and Huberman, Bernardo A.},
  date = {2010-08},
  keywords = {Web sites, Social Media, attention, prediction, social media content, content sharing, social networking (online), market-based predictors, Twitter.com, social networking},
  file = {IEEE Xplore Abstract Record:/home/jeremy/Zotero/storage/AT38MBGW/articleDetails.html:text/html;IEEE Xplore Abstract Record:/home/jeremy/Zotero/storage/NAPSZ9F4/login.html:text/html;IEEE Xplore Full Text PDF:/home/jeremy/Zotero/storage/5XINGQC4/Asur and Huberman - 2010 - Predicting the Future with Social Media.pdf:application/pdf}
}

% "Dirichlet" is a proper noun, so it is braced to survive sentence-casing styles.
% The journal name is spelled without the leading article.
@article{blei_latent_2003,
  title = {Latent {Dirichlet} Allocation},
  volume = {3},
  url = {http://dl.acm.org/citation.cfm?id=944937},
  pages = {993--1022},
  journaltitle = {Journal of Machine Learning Research},
  author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  urldate = {2015-12-03},
  date = {2003},
  file = {Blei et al. - 2003 - Latent dirichlet allocation.pdf:/home/jeremy/Zotero/storage/2K3E7TJH/Blei et al. - 2003 - Latent dirichlet allocation.pdf:application/pdf}
}

% "U.S." is braced in the title so sentence-casing styles do not lower-case it.
@article{dimaggio_exploiting_2013,
  title = {Exploiting affinities between topic modeling and the sociological perspective on culture: Application to newspaper coverage of {U.S.}
government arts funding},
  volume = {41},
  issn = {0304-422X},
  url = {http://linkinghub.elsevier.com/retrieve/pii/S0304422X13000661},
  doi = {10.1016/j.poetic.2013.08.004},
  shorttitle = {Exploiting affinities between topic modeling and the sociological perspective on culture},
  pages = {570--606},
  number = {6},
  journaltitle = {Poetics},
  author = {{DiMaggio}, Paul and Nag, Manish and Blei, David},
  urldate = {2016-01-02},
  date = {2013-12},
  langid = {english},
  file = {exploiting-affinities.pdf:/home/jeremy/Zotero/storage/7D8NAGNB/exploiting-affinities.pdf:application/pdf}
}

% Conference paper on predicting the growth of reshare cascades.
@inproceedings{cheng_can_2014,
  location = {New York, {NY}, {USA}},
  title = {Can cascades be predicted?},
  isbn = {978-1-4503-2744-2},
  url = {http://doi.acm.org/10.1145/2566486.2567997},
  doi = {10.1145/2566486.2567997},
  series = {{WWW} '14},
  abstract = {On many social networking web sites such as Facebook and Twitter, resharing or reposting functionality allows users to share others' content with their own friends or followers. As content is reshared from user to user, large cascades of reshares can form. While a growing body of research has focused on analyzing and characterizing such cascades, a recent, parallel line of work has argued that the future trajectory of a cascade may be inherently unpredictable. In this work, we develop a framework for addressing cascade prediction problems. On a large sample of photo reshare cascades on Facebook, we find strong performance in predicting whether a cascade will continue to grow in the future. We find that the relative growth of a cascade becomes more predictable as we observe more of its reshares, that temporal and structural features are key predictors of cascade size, and that initially, breadth, rather than depth in a cascade is a better indicator of larger cascades. This prediction performance is robust in the sense that multiple distinct classes of features all achieve similar performance.
We also discover that temporal features are predictive of a cascade's eventual shape. Observing independent cascades of the same content, we find that while these cascades differ greatly in size, we are still able to predict which ends up the largest.},
  pages = {925--936},
  booktitle = {Proceedings of the 23rd International Conference on World Wide Web},
  publisher = {{ACM}},
  author = {Cheng, Justin and Adamic, Lada and Dow, P. Alex and Kleinberg, Jon Michael and Leskovec, Jure},
  urldate = {2015-04-06},
  date = {2014},
  keywords = {cascade prediction, contagion, information diffusion},
  file = {Cheng et al. - 2014 - Can Cascades Be Predicted.pdf:/home/jeremy/Zotero/storage/KPPCCRXU/Cheng et al. - 2014 - Can Cascades Be Predicted.pdf:application/pdf}
}

% "Python" is a proper noun: capitalised and braced so styles cannot lower-case it.
@article{pedregosa_scikit-learn:_2011,
  title = {Scikit-learn: Machine Learning in {Python}},
  volume = {12},
  url = {http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html},
  shorttitle = {Scikit-learn},
  abstract = {Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and {API} consistency. It has minimal dependencies and is distributed under the simplified {BSD} license, encouraging its use in both academic and commercial settings.
Source code, binaries, and documentation can be downloaded from http://scikit-learn.sourceforge.net.},
  pages = {2825--2830},
  journaltitle = {Journal of Machine Learning Research},
  author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard},
  urldate = {2016-06-07},
  date = {2011-10},
  file = {Scikit-learn\: Machine Learning in Python:/home/jeremy/Zotero/storage/6XS2PM2P/Pedregosa et al. - 2011 - Scikit-learn Machine Learning in Python.pdf:application/pdf}
}
% The entry above no longer carries the exporter's key-pinning note,
% which only repeated the citation key and would have been printed in the bibliography.

% Magazine article on the release of OkCupid user data.
@article{zimmer_okcupid_2016,
  title = {{OkCupid} Study Reveals the Perils of Big-Data Science},
  url = {https://www.wired.com/2016/05/okcupid-study-reveals-perils-big-data-science/},
  abstract = {The data of 70,000 {OKCupid} users is now searchable in a database.
Ethicist Michael T Zimmer explains why it doesn't matter that it was "already public."},
  journaltitle = {{WIRED}},
  author = {Zimmer, Michael},
  urldate = {2016-08-31},
  date = {2016-05-14},
  file = {Snapshot:/home/jeremy/Zotero/storage/KV5P4IA9/okcupid-study-reveals-perils-big-data-science.html:text/html}
}

% "Matthew" is a proper noun, braced so sentence-casing styles keep the capital.
% The DOI is supplied as a stable identifier alongside the course-page PDF link.
% NOTE(review): DOI was added from memory, not taken from this file; confirm it resolves to this article.
@article{merton_matthew_1968,
  title = {The {Matthew} effect in science},
  volume = {159},
  url = {http://www.unc.edu/~fbaum/teaching/PLSC541_Fall06/Merton_Science_1968.pdf},
  doi = {10.1126/science.159.3810.56},
  pages = {56--63},
  number = {3810},
  journaltitle = {Science},
  author = {Merton, Robert K.},
  urldate = {2014-09-27},
  date = {1968},
  file = {[PDF] from unc.edu:/home/jeremy/Zotero/storage/B3H2PG6R/Merton - 1968 - The Matthew effect in science.pdf:application/pdf}
}

% Article on scale-free degree distributions in growing networks.
@article{barabasi_emergence_1999,
  title = {Emergence of Scaling in Random Networks},
  volume = {286},
  issn = {0036-8075, 1095-9203},
  url = {http://science.sciencemag.org/content/286/5439/509},
  doi = {10.1126/science.286.5439.509},
  abstract = {Systems as diverse as genetic networks or the World Wide Web are best described as networks with complex topology. A common property of many large networks is that the vertex connectivities follow a scale-free power-law distribution. This feature was found to be a consequence of two generic mechanisms: (i) networks expand continuously by the addition of new vertices, and (ii) new vertices attach preferentially to sites that are already well connected.
A model based on these two ingredients reproduces the observed stationary scale-free distributions, which indicates that the development of large networks is governed by robust self-organizing phenomena that go beyond the particulars of the individual systems.},
  pages = {509--512},
  number = {5439},
  journaltitle = {Science},
  author = {Barabási, Albert-László and Albert, Réka},
  urldate = {2016-10-06},
  date = {1999-10-15},
  langid = {english},
  pmid = {10521342},
  file = {Barabási and Albert - 1999 - Emergence of Scaling in Random Networks.pdf:/home/jeremy/Zotero/storage/D4DAX5XA/Barabási and Albert - 1999 - Emergence of Scaling in Random Networks.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/JETSMGUZ/509.html:text/html}
}

% PLOS ONE article on detecting significant structural change in networks.
@article{rosvall_mapping_2010,
  title = {Mapping Change in Large Networks},
  volume = {5},
  issn = {1932-6203},
  url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0008694},
  doi = {10.1371/journal.pone.0008694},
  abstract = {Change is a fundamental ingredient of interaction patterns in biology, technology, the economy, and science itself: Interactions within and between organisms change; transportation patterns by air, land, and sea all change; the global financial flow changes; and the frontiers of scientific research change. Networks and clustering methods have become important tools to comprehend instances of these large-scale structures, but without methods to distinguish between real trends and noisy data, these approaches are not useful for studying how networks change. Only if we can assign significance to the partitioning of single networks can we distinguish meaningful structural changes from random fluctuations. Here we show that bootstrap resampling accompanied by significance clustering provides a solution to this problem.
To connect changing structures with the changing function of networks, we highlight and summarize the significant structural changes with alluvial diagrams and realize de Solla Price's vision of mapping change in science: studying the citation pattern between about 7000 scientific journals over the past decade, we find that neuroscience has transformed from an interdisciplinary specialty to a mature and stand-alone discipline.},
  pages = {e8694},
  number = {1},
  journaltitle = {{PLOS} {ONE}},
  shortjournal = {{PLOS} {ONE}},
  author = {Rosvall, Martin and Bergstrom, Carl T.},
  urldate = {2016-07-08},
  date = {2010-01-27},
  keywords = {Medicine and health sciences, Behavioral neuroscience, neuroscience, Algorithms, Structure of markets, Molecular neuroscience, Simulated annealing, Cellular neuroscience},
  file = {Full Text PDF:/home/jeremy/Zotero/storage/79Q8AFD4/Rosvall and Bergstrom - 2010 - Mapping Change in Large Networks.pdf:application/pdf;Snapshot:/home/jeremy/Zotero/storage/7Z6NMBHX/article.html:text/html}
}

% Conference paper on methodological pitfalls of social media big data.
@inproceedings{tufekci_big_2014,
  title = {Big Questions for social media big data: Representativeness, validity and other methodological pitfalls},
  isbn = {978-1-57735-657-8},
  shorttitle = {Big Questions for social media big data},
  abstract = {Large-scale databases of human activity in social media have captured scientific and policy attention, producing a flood of research and discussion. This paper considers methodological and conceptual challenges for this emergent field, with special attention to the validity and representativeness of social media big data analyses. Persistent issues include the over-emphasis of a single platform, Twitter, sampling biases arising from selection by hashtags, and vague and unrepresentative sampling frames. The sociocultural complexity of user behavior aimed at algorithmic invisibility (such as subtweeting, mock-retweeting, use of "screen captures" for text, etc.) further complicate interpretation of big data social media.
Other challenges include accounting for field effects, i.e. broadly consequential events that do not diffuse only through the network under study but affect the whole society. The application of network methods from other fields to the study of human social activity may not always be appropriate. The paper concludes with a call to action on practical steps to improve our analytic capacity in this promising, rapidly-growing field.},
  rights = {Copyright © 2014, Association for the Advancement of Artificial Intelligence (www.aaai.org). All rights reserved.},
  eventtitle = {8th International Conference on Weblogs and Social Media, {ICWSM} 2014},
  booktitle = {Proceedings of the 8th International Conference on Weblogs and Social Media, {ICWSM} 2014},
  pages = {505--514},
  author = {Tufekci, Zeynep},
  date = {2014}
}
% In the entry above the copyright line lives in `rights` rather than the abstract,
% and the proceedings title is given as `booktitle`, with the conference name in `eventtitle`.
% NOTE(review): the given name "Zeynep" was expanded from the initial "Z." — confirm.

% Science article on what the Google Flu Trends errors teach about big data analysis.
@article{lazer_parable_2014,
  title = {The Parable of Google Flu: Traps in Big Data Analysis},
  volume = {343},
  rights = {Copyright © 2014, American Association for the Advancement of Science},
  issn = {0036-8075, 1095-9203},
  url = {http://science.sciencemag.org/content/343/6176/1203},
  doi = {10.1126/science.1248506},
  shorttitle = {The Parable of Google Flu},
  abstract = {In February 2013, Google Flu Trends ({GFT}) made headlines but not for a reason that Google executives or the creators of the flu tracking system would have hoped. Nature reported that {GFT} was predicting more than double the proportion of doctor visits for influenza-like illness ({ILI}) than the Centers for Disease Control and Prevention ({CDC}), which bases its estimates on surveillance reports from laboratories across the United States (1, 2). This happened despite the fact that {GFT} was built to predict {CDC} reports. Given that {GFT} is often held up as an exemplary use of big data (3, 4), what lessons can we draw from this error? Large errors in flu prediction were largely avoidable, which offers lessons for the use of big data.
},
  pages = {1203--1205},
  number = {6176},
  journaltitle = {Science},
  author = {Lazer, David and Kennedy, Ryan and King, Gary and Vespignani, Alessandro},
  urldate = {2016-10-06},
  date = {2014-03-14},
  langid = {english},
  pmid = {24626916},
  file = {Full Text PDF:/home/jeremy/Zotero/storage/UFHNQF8W/Lazer et al. - 2014 - The Parable of Google Flu Traps in Big Data Analy.pdf:application/pdf}
}
% The abstract of the entry above previously repeated its final sentence; it now appears once.

% Article offering six provocations about Big Data as a socio-technical phenomenon.
@article{boyd_critical_2012,
  title = {Critical questions for big data},
  volume = {15},
  issn = {1369-118X},
  url = {http://dx.doi.org/10.1080/1369118X.2012.678878},
  doi = {10.1080/1369118X.2012.678878},
  abstract = {The era of Big Data has begun. Computer scientists, physicists, economists, mathematicians, political scientists, bio-informaticists, sociologists, and other scholars are clamoring for access to the massive quantities of information produced by and about people, things, and their interactions. Diverse groups argue about the potential benefits and costs of analyzing genetic sequences, social media interactions, health records, phone logs, government records, and other digital traces left by people. Significant questions emerge. Will large-scale search data help us create better tools, services, and public goods? Or will it usher in a new wave of privacy incursions and invasive marketing? Will data analytics help us understand online communities and political movements? Or will it be used to track protesters and suppress speech? Will it transform how we study human communication and culture, or narrow the palette of research options and alter what ‘research’ means? Given the rise of Big Data as a socio-technical phenomenon, we argue that it is necessary to critically interrogate its assumptions and biases.
In this article, we offer six provocations to spark conversations about the issues of Big Data: a cultural, technological, and scholarly phenomenon that rests on the interplay of technology, analysis, and mythology that provokes extensive utopian and dystopian rhetoric.},
  pages = {662--679},
  number = {5},
  journaltitle = {Information, Communication \& Society},
  author = {given=danah, family=boyd and Crawford, Kate},
  urldate = {2016-08-09},
  date = {2012},
  file = {boyd and Crawford - 2012 - Critical Questions for Big Data.pdf:/home/jeremy/Zotero/storage/XEM23ZJG/boyd and Crawford - 2012 - Critical Questions for Big Data.pdf:application/pdf}
}
% In the entry above the first author stays in the extended name format so the lower-case spelling survives.
% Names are separated by a lower-case "and", and the second author is in "Family, Given" form.

% Book on prediction and uncertainty.
@book{silver_signal_2015,
  location = {New York, New York},
  title = {The Signal and the Noise: Why So Many Predictions Fail--but Some Don't},
  isbn = {978-0-14-312508-2},
  shorttitle = {The Signal and the Noise},
  abstract = {One of Wall Street Journal's Best Ten Works of Nonfiction in 2012   New York Times Bestseller “Not so different in spirit from the way public intellectuals like John Kenneth Galbraith once shaped discussions of economic policy and public figures like Walter Cronkite helped sway opinion on the Vietnam War…could turn out to be one of the more momentous books of the decade.” —New York Times Book Review   "Nate Silver's The Signal and the Noise is The Soul of a New Machine for the 21st century." —Rachel Maddow, author of Drift "A serious treatise about the craft of prediction—without academic mathematics—cheerily aimed at lay readers. Silver's coverage is polymathic, ranging from poker and earthquakes to climate change and terrorism." —New York Review of Books Nate Silver built an innovative system for predicting baseball performance, predicted the 2008 election within a hair’s breadth, and became a national sensation as a blogger—all by the time he was thirty. He solidified his standing as the nation's foremost political forecaster with his near perfect prediction of the 2012 election.
Silver is the founder and editor in chief of {FiveThirtyEight}.com.  Drawing on his own groundbreaking work, Silver examines the world of prediction, investigating how we can distinguish a true signal from a universe of noisy data. Most predictions fail, often at great cost to society, because most of us have a poor understanding of probability and uncertainty. Both experts and laypeople mistake more confident predictions for more accurate ones. But overconfidence is often the reason for failure. If our appreciation of uncertainty improves, our predictions can get better too. This is the “prediction paradox”: The more humility we have about our ability to make predictions, the more successful we can be in planning for the future. In keeping with his own aim to seek truth from data, Silver visits the most successful forecasters in a range of areas, from hurricanes to baseball, from the poker table to the stock market, from Capitol Hill to the {NBA}. He explains and evaluates how these forecasters think and what bonds they share. What lies behind their success? Are they good—or just lucky? What patterns have they unraveled? And are their forecasts really right? He explores unanticipated commonalities and exposes unexpected juxtapositions. And sometimes, it is not so much how good a prediction is in an absolute sense that matters but how good it is relative to the competition. In other cases, prediction is still a very rudimentary—and dangerous—science. Silver observes that the most accurate forecasters tend to have a superior command of probability, and they tend to be both humble and hardworking. They distinguish the predictable from the unpredictable, and they notice a thousand little details that lead them closer to the truth.
Because of their appreciation of probability, they can distinguish the signal from the noise. With everything from the health of the global economy to our ability to fight terrorism dependent on the quality of our predictions, Nate Silver’s insights are an essential read.},
  pagetotal = {560},
  publisher = {Penguin Books},
  author = {Silver, Nate},
  date = {2015}
}

% Post on the Social Media Collective Research Blog.
@online{sandvig_why_2016,
  title = {Why I Am Suing the Government},
  url = {https://socialmediacollective.org/2016/07/01/why-i-am-suing-the-government/},
  titleaddon = {Social Media Collective Research Blog},
  type = {Web Log},
  author = {Sandvig, Christian},
  urldate = {2016-10-23},
  date = {2016-07-01},
  file = {Snapshot:/home/jeremy/Zotero/storage/9USUHHJB/why-i-am-suing-the-government.html:text/html}
}

% Book on machine learning and the search for a universal learning algorithm.
@book{domingos_master_2015,
  location = {New York, New York},
  title = {The Master Algorithm: How the Quest for the Ultimate Learning Machine Will Remake Our World},
  shorttitle = {The Master Algorithm},
  abstract = {Algorithms increasingly run our lives. They find books, movies, jobs, and dates for us, manage our investments, and discover new drugs. More and more, these algorithms work by learning from the trails of data we leave in our newly digital world. Like curious children, they observe us, imitate, and experiment. And in the world’s top research labs and universities, the race is on to invent the ultimate learning algorithm: one capable of discovering any knowledge from data, and doing anything we want, before we even ask. Machine learning is the automation of discovery—the scientific method on steroids—that enables intelligent robots and computers to program themselves. No field of science today is more important yet more shrouded in mystery. Pedro Domingos, one of the field’s leading lights, lifts the veil for the first time to give us a peek inside the learning machines that power Google, Amazon, and your smartphone.
He charts a course through machine learning’s five major schools of thought, showing how they turn ideas from neuroscience, evolution, psychology, physics, and statistics into algorithms ready to serve you. Step by step, he assembles a blueprint for the future universal learner—the Master Algorithm—and discusses what it means for you, and for the future of business, science, and society. If data-ism is today’s rising philosophy, this book will be its bible. The quest for universal learning is one of the most significant, fascinating, and revolutionary intellectual developments of all time. A groundbreaking book, The Master Algorithm is the essential guide for anyone and everyone wanting to understand not just how the revolution will happen, but how to be at its forefront.},
  pagetotal = {354},
  publisher = {Basic Books},
  author = {Domingos, Pedro},
  date = {2015}
}

% "Dirichlet" is braced in title and shorttitle so sentence-casing styles keep the capital.
@inproceedings{arun_finding_2010,
  title = {On Finding the Natural Number of Topics with Latent {Dirichlet} Allocation: Some Observations},
  isbn = {978-3-642-13656-6},
  url = {https://link.springer.com/chapter/10.1007/978-3-642-13657-3_43},
  doi = {10.1007/978-3-642-13657-3_43},
  series = {Lecture Notes in Computer Science},
  shorttitle = {On Finding the Natural Number of Topics with Latent {Dirichlet} Allocation},
  abstract = {It is important to identify the “correct” number of topics in mechanisms like Latent Dirichlet Allocation ({LDA}) as they determine the quality of features that are presented as features for classifiers like {SVM}. In this work we propose a measure to identify the correct number of topics and offer empirical evidence in its favor in terms of classification accuracy and the number of topics that are naturally present in the corpus. We show the merit of the measure by applying it on real-world as well as synthetic data sets (both text and images).
In proposing this measure, we view {LDA} as a matrix factorization mechanism, wherein a given corpus C is split into two matrix factors M1 and M2 as given by Cd*w = M1d*t x Qt*w. Where d is the number of documents present in the corpus and w is the size of the vocabulary. The quality of the split depends on “t”, the right number of topics chosen. The measure is computed in terms of symmetric {KL}-Divergence of salient distributions that are derived from these matrix factors. We observe that the divergence values are higher for non-optimal number of topics – this is shown by a ’dip’ at the right value for ’t’.},
  eventtitle = {Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  pages = {391--402},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  publisher = {Springer},
  location = {Berlin, Heidelberg},
  author = {Arun, R. and Suresh, V. and Madhavan, C. E. Veni and Murthy, M. N. Narasimha},
  urldate = {2017-07-06},
  date = {2010-06-21},
  langid = {english},
  file = {Arun et al. - 2010 - On Finding the Natural Number of Topics with Laten.pdf:/home/jeremy/Zotero/storage/EMMCNH7F/Arun et al. - 2010 - On Finding the Natural Number of Topics with Laten.pdf:application/pdf}
}
% In the entry above the publisher's cities are held in `location`, separate from the publisher name.