\documentclass{sigchi}
%\documentclass[12pt]{article} % FOR PRINTING: OTHERWISE REMOVE THIS LINE 

<<preinit, echo=FALSE>>=
# Document hook: in the knitted output, replace the literal line
# \usepackage[dvipsnames,usenames]{color} with \usepackage[]{color}.
# NOTE(review): presumably this avoids an option clash with the
# \usepackage{color} loaded later in the preamble -- confirm.
knit_hooks$set(document = function(x) {
  sub(
    pattern = '\\usepackage[dvipsnames,usenames]{color}',
    replacement = '\\usepackage[]{color}',
    x = x,
    fixed = TRUE
  )
})
@

% Use this section to set the ACM copyright statement (e.g. for
% preprints).  Consult the conference website for the camera-ready
% copyright statement.

% Use this command to override the default ACM copyright statement
% (e.g. for preprints).  Consult the conference website for the
% camera-ready copyright statement.

% Arabic page numbers for submission.  Remove this line to eliminate
% page numbers for the camera ready copy
% \pagenumbering{arabic}
\usepackage[pdflang={en-US},pdftex]{hyperref}
% Load basic packages
\usepackage{dcolumn}
\newcolumntype{d}[1]{D{.}{.}{#1} }
\usepackage{array}
% \usepackage{balance}       % to better equalize the last page
\usepackage{graphics}      % for EPS, load graphicx instead 
\usepackage[T1]{fontenc}   % for umlauts and other diaeresis
\usepackage{txfonts}  % temporarily(?) turned off -mako
\usepackage{mathptmx}

\usepackage{color}
\usepackage{booktabs}
\usepackage{textcomp}
\usepackage{balance}
% Some optional stuff you might like/need.
\usepackage{microtype}        % Improved Tracking and Kerning
 \usepackage[all]{hypcap}    % Fixes bug in hyperref caption linking
\usepackage{ccicons}          % Cite your images correctly!
\usepackage[utf8]{inputenc} % for a UTF8 editor only

\usepackage{dcolumn}
% Paper metadata (use plain text, for PDF inclusion and later
% re-using, if desired).  Use \emptyauthor when submitting for review
% so you remain anonymous.
\def\plaintitle{Revisiting ``The Rise and Decline'' \\ in a Population of Peer Production Projects}
\def\plainkeywords{governance; peer production; online communities; quality control; retention; replication; Wikipedia; wikis}
\def\plainauthor{Nathan TeBlunthuis, Aaron Shaw, Benjamin Mako Hill}
\def\emptyauthor{}


% llt: Define a global style for URLs, rather than the default one
%  \makeatletter
% \def\url@leostyle{%
%  \@ifundefined{selectfont}{
%    \def\UrlFont{\sf}
%  }{
%    \def\UrlFont{\small\bf\ttfamily}
%  }}
% \makeatother
% \urlstyle{leo}

% To make various LaTeX processors do the right thing with page size.
 \def\pprw{8.5in}
\def\pprh{11in}
\special{papersize=\pprw,\pprh}
\setlength{\paperwidth}{\pprw}
\setlength{\paperheight}{\pprh}
\setlength{\pdfpagewidth}{\pprw}
\setlength{\pdfpageheight}{\pprh}

\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{positioning}

% Make sure hyperref comes last of your loaded packages, to give it a
% fighting chance of not being over-written, since its job is to
% redefine many LaTeX commands.
\definecolor{linkColor}{RGB}{6,125,233}
\hypersetup{%
  pdftitle={\plaintitle},
% Use \plainauthor for final version.
  pdfauthor={\plainauthor},
%  pdfauthor={\emptyauthor},
  pdfkeywords={\plainkeywords},
  pdfdisplaydoctitle=true, % For Accessibility
  bookmarksnumbered,
  pdfstartview={FitH},
  colorlinks,
  citecolor=black,
  filecolor=black,
  linkcolor=black,
  urlcolor=black,
  breaklinks=true,
  hypertexnames=false
}

% create a shortcut to typeset table headings
 \newcommand\tabhead[1]{\small\textbf{#1}}

% End of preamble. Here it comes the document.
\toappear{\scriptsize Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). Copyright is held by the author/owner(s). \\
{\emph{CHI 2018, April 21--26, 2018, Montr\'eal, QC, Canada.} } \\
ACM ISBN 978-1-4503-5620-6/18/04. \\
https://doi.org/10.1145/3173574.3173929}
% Update the XXXX string to your assigned DOI from ACM.

\clubpenalty=10000
\widowpenalty = 10000


\begin{document}
%CopyrightYear{2018}
%setcopyright{rightsretained}
%conferenceinfo{CHI 2018}{April 21--26, 2018, Montreal, QC, %Canada}\isbn{978-1-4503-5620-6/18/04}
%\doi{}


<<init,echo=FALSE>>=

library(scales)
# Signed base-2 log transform.
#   x < 0  ->  -log2(|x|) - 1
#   x > 0  ->   log2(x) + 1
# Zeros and NAs are returned unchanged.
neg.log.2 <- function(x){
    known <- !is.na(x)
    # Both masks are computed from the untouched input, before any
    # element is overwritten.
    below <- known & (x < 0)
    above <- known & (x > 0)
    x[below] <- -log(abs(x[below]), base = 2) - 1
    x[above] <- log(x[above], base = 2) + 1
    x
}

# Counterpart of neg.log.2 on the transformed scale.
#   x < 0  ->  -(2 ^ |x + 1|)
#   x > 0  ->    2 ^ (x - 1)
# Zeros and NAs are returned unchanged.
neg.log.2.inv <- function(x){
    known <- !is.na(x)
    # Masks are taken from the untouched input, before any overwrite.
    below <- known & (x < 0)
    above <- known & (x > 0)
    x[below] <- -(2^abs(x[below] + 1))
    x[above] <- 2^(x[above] - 1)
    x
}

# Bundle the signed log2 transform and its inverse into a scales
# transformation object named "neg.log2".
neg.log2.trans <- trans_new("neg.log2", transform=neg.log.2,inverse=neg.log.2.inv)

# Format a number as a LaTeX ordinal, e.g. 21 -> "21\textsuperscript{st}".
#
# n: a number or a string of digits; vectors are handled element-wise.
#    Leading zeros are dropped, so "01" is rendered as "1".
# Returns a character vector with one entry per element of n.
format.ordinal <- function(n){
    n <- sub("^0+(?=[0-9])", "", as.character(n), perl=TRUE)
    len <- nchar(n)
    last <- substr(n, len, len)
    last.two <- substr(n, pmax(len - 1, 1), len)

    # The suffix follows the last digit ...
    suffix <- rep("th", length(n))
    suffix[last %in% "1"] <- "st"
    suffix[last %in% "2"] <- "nd"
    suffix[last %in% "3"] <- "rd"
    # ... except that 11, 12 and 13 (and 111, 212, ...) take "th".
    suffix[last.two %in% c("11", "12", "13")] <- "th"

    paste0(n, "\\textsuperscript{", suffix, "}")
}

library('ggplot2')
library('data.table')
suppressPackageStartupMessages(library('lubridate'))

# Logistic curve evaluated at x*B: 1 / (1 + exp(-x*B)).
logit <- function(x, B) {
  decay <- exp(-x * B)
  1 / (1 + decay)
}

# Odds p / (1 - p), where p = logit(x, B).
odds <- function(x, B) {
  p <- logit(x, B)
  p / (1 - p)
}

#suppressPackageStartupMessages(library(texreg,quietly=TRUE,lib.loc="x86_64-pc-linux-gnu-library"))
# Wrap x in a braced LaTeX \textbf group.
bold <- function(x) {
  paste0('{\\textbf{', x, '}}')
}
# Wrap x in a braced LaTeX \textcolor{gray} group.
gray <- function(x) {
  paste0('{\\textcolor{gray}{', x, '}}')
}
# Surround x with a single pair of braces.
wrapify <- function (x) {
  paste0("{", x, "}")
}

# Load precomputed objects saved by earlier analysis steps.
r.userroles <- readRDS("knitr/lib-01-generate_userroles.RDS")
r <- readRDS("knitr/remember.RDS")
# attach() puts the contents of r on the search path so that chunks and
# \Sexpr{} calls can refer to them by bare name.
# NOTE(review): names such as n.wikia.wikis and p.stats used later presumably
# come from r -- confirm. attach() hides where names originate; r$name or
# with(r, ...) would be more explicit but requires touching every use.
attach(r)
# Coefficients and standard errors of the two models, converted to lists.
# NOTE(review): halfak.model and morgan.model are presumably elements of r.
m1.coef <- as.list(halfak.model@coef)
m1.se <- as.list(halfak.model@se)
m2.coef <- as.list(morgan.model@coef)
m2.se <- as.list(morgan.model@se)

# Format x as an integer string with commas as the thousands separator.
f <- function (x) {
  formatC(x, big.mark = ",", format = "d")
}
# Render a proportion as a percentage with two significant digits,
# followed by a LaTeX-escaped percent sign.
format.percent <- function(x) {
  pct <- signif(100 * x, 2)
  paste0(pct, "\\%")
}
# Day of the month of a date as a LaTeX ordinal, e.g.
# "21\textsuperscript{st}" or "11\textsuperscript{th}".
#
# x: a Date or date-time that format() can render with "%d"; vectors are
#    handled element-wise.
# Returns a character vector; the day is printed without a leading zero.
format.day.ordinal <- function(x) {
    day <- as.integer(format(x,format="%d"))
    units <- day %% 10L
    # 11, 12 and 13 are the exceptions to the last-digit rule and take "th".
    teen <- day %in% 11:13

    suffix <- rep("th", length(day))
    suffix[!teen & units %in% 1L] <- "st"
    suffix[!teen & units %in% 2L] <- "nd"
    suffix[!teen & units %in% 3L] <- "rd"

    paste0(day, "\\textsuperscript{", suffix, "}")
}

# Render a date as full month name followed by the four-digit year ("%B %Y").
format.month <- function(x){
    month.and.year <- format(x, format = '%B %Y')
    month.and.year
}

# Render a date as "<month name> <ordinal day> <year>", space separated.
# The day part comes from format.day.ordinal() and carries LaTeX markup.
format.date <- function(x) {
    month <- format(x, format = '%B')
    day <- format.day.ordinal(x)
    year <- format(x, format = '%Y')
    paste(month, day, year)
}
@

\title{\plaintitle}

\numberofauthors{3}
\author{%
  \alignauthor{Nathan TeBlunthuis\\
    \affaddr{University of Washington}\\
    \affaddr{Seattle, Washington, USA}\\
    \email{nathante@uw.edu}}\\
  \alignauthor{Aaron Shaw\\
    \affaddr{Northwestern University}\\
    \affaddr{Evanston, IL, USA}\\
    \email{aaronshaw@northwestern.edu}}\\
  \alignauthor{Benjamin Mako Hill\\
    \affaddr{University of  Washington}\\
    \affaddr{Seattle, Washington, USA}\\
    \email{makohill@uw.edu}}\\
}
\maketitle

\begin{abstract}
Do patterns of growth and stabilization found in large peer production systems such as Wikipedia occur in other communities? This study assesses the generalizability of Halfaker et al.'s influential 2013 paper on “The Rise and Decline of an Open Collaboration System.” We replicate its tests of several theories related to newcomer retention and norm entrenchment using a dataset of hundreds of active peer production wikis from Wikia. We reproduce the subset of the findings from Halfaker and colleagues that we are able to test, comparing both the estimated signs and magnitudes of our models. Our results support the external validity of Halfaker et al.'s claims that quality control systems may limit the growth of peer production communities by deterring new contributors and that norms tend to become entrenched over time. 
\end{abstract}

\category{H.5.3.}{Information Interfaces and Presentation (e.g. HCI)}{Group and Organization Interfaces -- Computer-supported cooperative work}

\keywords{governance; peer production; online communities; quality control; retention; replication; Wikipedia; wikis}


\section{Introduction}

``Peer production'' describes a way of organizing collaborative information production in online commons \cite{benkler_coases_2002-2}. Over the last decade, peer production has become a central object of HCI research. However, the vast majority of peer production research has studied a small number of the largest communities \cite{benkler_peer_2015,crowston_free/libre_2008}. An enormous portion of empirical studies of peer production in HCI are of the English-language version of Wikipedia. Unfortunately, HCI's historical focus on novelty has meant that tests of the applicability of findings shown in one setting to other contexts rarely happen \cite{wilson_replichi_2011}. As a result, we know little about the degree to which theory and design claims from studies of Wikipedia apply more broadly.

\begin{figure}
<<plot-editors, echo=FALSE,fig.height=3.125,fig.width=5,out.width="\\columnwidth",cache=FALSE,message=FALSE,warning=FALSE>>=
# Standardized number of active editors by wiki age in months, with error
# bars and a dashed LOESS trend line.

# One x-axis label per year of wiki age; each tick sits at the smallest
# month value observed for that year.
xlabels <- paste0("Year ", 0:max(plot.active.editors.dt$wiki.age.years))
xbreaks <- plot.active.editors.dt[,.(b=min(wiki.age.months)),by=.(wiki.age.years)]$b

p2 <- ggplot(plot.active.editors.dt,
             aes(y=sd.units.active.editors, x=wiki.age.months,
                 ymin=lower.ci, ymax=upper.ci)) +
    geom_point() +
    geom_errorbar(width=0.5, alpha=0.25) +
    geom_smooth(method='loess', se=FALSE, linetype='dashed',
                color='#E69F00', size=1.1) +
    scale_x_continuous(name="Wiki age", breaks=xbreaks, labels=xlabels) +
    # `limits` is spelled out in full rather than relying on partial
    # argument matching of `limit`.
    scale_y_continuous(name="Active editors (Std dev units)",
                       limits=c(1,2.05), breaks=c(1,1.5,2),
                       minor_breaks=NULL) +
    theme_minimal(base_size=12) +
    # "none" is the documented spelling for suppressing the legend.
    theme(legend.position="none")

print(p2)
@

\caption{Mean of the number of editors with at least 5 edits per month in standard deviation units for wikis in our sample. The dashed line represents the results of a LOESS regression. The error bars represent bootstrap 95\% confidence intervals. This replicates Figure 2 in RAD.}
\label{plot.editors.time}
\end{figure}

This paper replicates analysis from Halfaker et al.'s ``The Rise and Decline of an Open Collaboration System'' \cite{halfaker_rise_2013} (which we abbreviate  ``RAD'') in a sample of \Sexpr{n.wikia.wikis} active wikis hosted on Wikia.\footnote{Wikia is a wiki hosting platform where anyone can start a wiki. In 2016, Wikia partially rebranded as ``Fandom'' to emphasize support for fan communities. See: \url{https://www.wikia.com/} (\url{https://perma.cc/TL79-VB57}).} RAD makes one of the most influential and highly cited claims about peer production dynamics, attributing English Wikipedia's decline in contributors since 2007 to entrenchment (RAD uses the term ``calcification'') within the community as norms and policies become difficult to change, especially for newer users. Our results reproduce most of RAD's findings. Like RAD, we find that the average community in our dataset experiences a ``rise and decline,'' that newcomers are less likely to survive over time, that rejected newcomers are less likely to survive, that editors with longer tenure have more influence over norms, and that norms become entrenched as wikis age. In addition to providing an external validation of RAD's findings, we rule out alternative explanations of RAD's results that emphasize unique attributes of Wikipedia or the timing of the editor decline in that community.

\section{Background}
\subsection{Entrenchment in Wikipedia and Peer Production}

Active peer production communities often experience a period of rapid growth followed by stabilization \cite{ortega_wikipedia:_2009, schweik_internet_2012}. Following this pattern, the number of contributors to English Wikipedia grew exponentially until March 2007, when it began to decline \cite{suh_singularity_2009-2}. Although early accounts of peer production argued that projects such as Wikipedia and the Linux kernel mobilized massive collaboration without the sorts of formal hierarchies or bureaucracies used in formal organizations \cite{benkler_coases_2002-2, konieczny_governance_2009}, organizational research has argued that the formalization of rules, norms, and routines accompanies this trajectory in many types of organizations \cite{hannan_population_1977, meyer_institutionalized_1977, scott_organizations_2006}. Drawing from this work, early explanations for Wikipedia's decline included bureaucratic overhead and increasing resistance to contributions from less active editors \cite{suh_singularity_2009-2, forte_decentralization_2009}. During this same period, algorithmic tools such as ``bots'' became important parts of Wikipedia's quality control systems \cite{geiger_work_2010}, and the proportion of edits that were rejected increased  \cite{halfaker_dont_2011}. Building on this prior work, Halfaker et al.'s ``The Rise and Decline of an Open Collaboration System'' \cite{halfaker_rise_2013} found evidence in support of the theory that three elements of Wikipedia's quality control system---newcomer rejection, algorithmic tools, and norm entrenchment---could explain the transition from growth to decline. However, despite the impact and influence of RAD's explanation, it has not been replicated beyond Wikipedia until now.

\subsection{Replication in Social Computing Research}

Although HCI research prizes novelty and provocation, it also seeks to build scientifically rigorous, replicable, and generalizable knowledge \cite{wilson_replichi_2011}. Replicability refers to how well results hold up when other researchers follow reported procedures. Hornbæk et al.~define replication studies as attempts ``to confirm, expand, or generalize an earlier study's findings''  \cite{hornbaek_is_2014}. Generalizability (external validity) refers to the degree to which results hold up across different populations \cite{bollen_social_2015}. Replication studies thus assess whether details of context or methodological choice explain results.

Although comparative analysis of peer production communities has emerged as an important means to understand the life cycles and dynamics of social computing systems \cite{ortega_wikipedia:_2009,roth_measuring_2008,shaw_laboratories_2014}, there have been few efforts to establish whether findings from Wikipedia and other large communities replicate or generalize \cite{benkler_peer_2015,hill_studying_2017}. In one important exception, Kittur and Kraut \cite{kittur_beyond_2010} examine the prevalence of social mechanisms related to conflict and coordination in Wikipedia among nearly 7,000 wikis from Wikia. Their work found both similarities and differences between these communities and Wikipedia.

\subsection{Replicating RAD}

Do the relationships described in RAD generalize to other peer production communities? The evidence on project life cycles, stabilization, and entrenchment suggests that similar patterns may occur beyond Wikipedia. However, Wikipedia's scale and popularity make it a unique outlier among these communities. It is likely unusual in other ways as well. In their conclusion, the RAD authors note: ``Wikipedia's  [growth and quality assurance] challenges may seem unique to its status as one of the largest collaborative projects in human history.'' Nevertheless, they suggest that their analysis of ``sociotechnical gatekeeping and its consequences''  has general applicability.  Indeed, their conclusions have informed analyses of crowdsourced fund-raising \cite{agrawal_simple_2014}, social media \cite{crawford_what_2016}, and online collaborative mapping \cite{palen_success_2015}. 

This paper assesses the replicability and external validity of RAD to provide an empirical foundation for such generalization. In doing so, we also evaluate several alternative explanations that RAD could not rule out. In particular, the simultaneous decline and entrenchment RAD observes in Wikipedia could be driven by external factors related to time, such as the rise of other online communities (e.g., Facebook) that might compete for newcomers. By studying a population of communities whose trajectories start at different points in time, we can model wiki age accounting for calendar time in ways RAD could not. By studying many communities, we can also better understand the scope of RAD's generalizability by measuring variation between wikis.

\section{Methods}
 
% P.1
We attempt to follow RAD's measures and methods to the fullest extent possible. In some places, we are forced to make changes to accommodate differences between English Wikipedia and Wikia and the fact that our analysis includes multiple wikis. To describe our methodology, we briefly summarize RAD's techniques and note several ways that we diverge. Additional detail on operationalization is provided in RAD. We also provide access to the complete R source code that we used to complete our analysis in the supplementary material that accompanies this paper. In the description that follows, variable names are italicized.

% P.2
The RAD authors present three interdependent analyses. The first tests whether the rejection of edits made by newcomers causes decreased newcomer retention which in turn leads to a decline in the number of active editors.
To support this claim, the RAD authors use data from English Wikipedia to plot three trends: the number of active contributors, the rate of newcomer survival, and the rate of newcomer rejection. The first plot shows the rise and decline in active contributors (i.e.,~individuals who make at least 5 edits in a given month). The second plot shows that the proportion of good-faith newcomers who ``survive'' falls over time. The third plot shows that the proportion of good-faith newcomers ``rejected'' in their first edit session rises over time. RAD considers a newcomer to have \emph{survived} if the newcomer edits during the period between \Sexpr{as.double(newcomer.period,units='days')} days and \Sexpr{as.double(newcomer.sunset,units='days')} days after their first edit session (i.e.,~sequence of consecutive edits less than one hour apart) and to have been \emph{rejected} if a change the newcomer makes to an article in their first edit session is undone. Using data from all newcomers drawn from a set of Wikia wikis, we replicate these plots in our Study 1. 

% P.3
Additionally, RAD provides evidence that newcomer rejection is a mechanism for declining newcomer retention by estimating logistic regression models predicting newcomer survival. According to these models, newcomers are less likely to survive both when rejected and when the community was older. RAD presents separate models for good-faith newcomers and for all newcomers. The variables in their model are \textit{year} to model time, \textit{session edits} (the number of edits made in the first session) to account for the newcomer's early activity level, \textit{messaged} specifying if the newcomer was messaged during the newcomer's first 60 days, \textit{reverted} indicating if the newcomer had an edit to an existing page rejected, and \textit{deleted} to specify whether the newcomer created a new page which was deleted. We replicate these findings in our Study 2.

% P.4
RAD's second analysis builds closely on the same logistic regression to test their theory that the rise of algorithmic quality control tools is an additional cause of Wikipedia's transition from rise to decline. They follow the methods of Geiger et al.~\cite{geiger_defense_2012} to track a number of different tools, including bots, to create a variable, \textit{tool reverted}, that indicates whether the newcomer was reverted by a bot or human using an algorithmic tool. Plots in their paper show that tool use increased greatly and that desirable newcomers were increasingly likely to be reverted by tools. RAD argues that tool use may decrease newcomer retention over-and-above other forms of rejection, because tool users are less likely to practice a norm thought to mitigate discouragement following rejection known as the ``BOLD, revert, discuss cycle'' (BRD). BRD prescribes that reverting editors reciprocate discussion with those they revert.  A negative coefficient for \textit{tool reverted} in the logistic regression model described above provides evidence that algorithmic tool use may be a mechanism for declining newcomer retention.  Because tool-based rejection is extremely rare on Wikia, we do not attempt to replicate RAD's finding that tool use is associated with lower levels of ``BOLD, revert, discuss.'' The rest of their analysis is replicated in our Study 2.

% P.5
In their third analysis, the RAD authors seek to measure the entrenchment of norms on Wikipedia. Norms are formed at many sites on Wikipedia, including three different kinds of norm pages analyzed by RAD: official policy pages, less formal guidelines, and informal essays. As evidence that norm entrenchment may be a cause of the decline, they plot the number of edits to these different kinds of pages over time. Edits to policies and guidelines began decreasing in 2006. Edits to essays slowed during the transition from rise to decline in 2008, decreasing thereafter. 

% P.6
RAD once again uses a logistic regression predicting whether an edit to a norm page is \emph{reverted} to provide evidence of norm entrenchment: norm pages become more difficult to edit over time, measured as \textit{year}, and those with greater \emph{editor tenure} have their contributions to norm pages reverted less often than newer editors. They also model whether or not the norm page was an \textit{essay}, the interaction between \textit{editor tenure} and \textit{essay}, and the interaction between \textit{essay} and \textit{year}. They find that essays had calcified substantially less than policies. Norm page categories do not exist systematically on Wikia, so we are not able to reproduce the analysis of different levels of formality in norm pages. We replicate the other analyses from RAD's regression analysis in our Study 3.

% P.7
As we have suggested, there are several parts of RAD that we do not attempt to replicate. RAD makes considerable effort to address a threat arising from high prevalence of vandalism on Wikipedia---because vandals may not intend to continue contributing, they may be unaffected by rejection. Additionally, a decline in the number of desirable newcomers may also be a cause of the decline in active contributors. To address these potential confounds, they hand-code a sample of ``good-faith'' newcomers and report that the proportion of newcomers classified as ``good-faith'' fell during the period of rapid growth, about one year before the transition to decline. Their results for their sample of good-faith newcomers and for all newcomers are substantively similar.

% P.8
Our work is only able to replicate RAD in a sample of all newcomers and does not attempt to create a sub-sample of desirable contributors. As experienced Wikipedians, the RAD authors and their volunteer coders were qualified to judge the quality of newcomers. Many Wikia wikis are about subject matter we are not familiar with. In many cases, they are written in languages we cannot read. We are confident in our results despite this omission for two reasons. First, RAD found very similar estimates in the models restricted to good-faith newcomers and the models that include all newcomers. To the extent that their good-faith-only estimates represented a robustness check that their analysis passed, we are comfortable forgoing it. Second, exploratory analysis of our data suggests that rates of vandalism are lower on Wikia than on Wikipedia, which should lessen the underlying threat.

% P.9
Additionally, our analysis deviates from RAD in several ways that reflect the challenges and threats associated with studying a population of communities. Most importantly, we diverge from RAD by using estimation techniques and additional control variables appropriate to data nested within multiple communities.
Because we consider multiple communities, it is possible that a single person might be a ``newcomer'' in our dataset more than once. To avoid analytic problems with repeated measures of users, and because individuals with experience in other wikis are likely not newcomers in the way RAD conceptualized them, our analysis identifies newcomers as individuals who have not edited any wiki in our sample and who are not marked as bots. In results available in our supplement, we fit models that include newcomers with prior experience in other wikis in our sample. Our results are not substantively different.

%P.10 TODO. it would be nice to cite something in this paragraph
Of course, RAD itself has limitations. In particular, the quality of the evidence for the proposed causes of newcomer retention hinges on the assumption that other contemporaneous factors did not drive the decline. However, external events such as the rise of social media sites such as Facebook, as well as cultural changes in how the Internet was used and popularly understood, overlap with the transition from growth to decline. Studying multiple wikis that began at different points in time allows us to partially address this limitation in RAD. Our analysis inherits other limitations from RAD that we do not address. Importantly, entrenchment is theorized to contribute to declining newcomer retention, although this relationship is not modeled explicitly.

\subsection{Data}

Our dataset consists of page, user, and revision history data from \Sexpr{n.wikia.wikis} wikis publicly hosted on Wikia, the largest peer production wiki platform in terms of number of communities. Our initial dataset included all public edits to all Wikia wikis between \Sexpr{format.date(earliest.data.point)} and \Sexpr{format.date(latest.data.point)}.
The \Sexpr{n.wikia.wikis} wikis whose data we use to replicate RAD include the top 1\% of Wikia wikis by number of unique registered article editors. We include only these wikis because they have newcomer and governance activity appropriate for replicating RAD.
We follow the RAD authors by only including newcomers we can observe for \Sexpr{as.double(newcomer.sunset,units='days')} days. We also obtain records identifying bot and administrator accounts from the Wikia API. We exclude \Sexpr{length(deleted.wikis)} wikis where the API is unavailable because the wiki had been deleted since the XML archives we used were created.

Our dataset includes substantial variation between wikis, including linguistic diversity,  activity level, and organizational complexity. Wikis vary in size, with numbers of unique contributors ranging from \Sexpr{f(wiki.stats[,min(total.editors)])} to \Sexpr{f(wiki.stats[,max(total.editors)])} (median \Sexpr{f(wiki.stats[,median(total.editors)])}). Some wikis in our sample produce collections of facts about popular culture, video games, and fandom. Others, such as the \textit{Althistory Wiki},\footnote{\url{http://althistory.wikia.com/}  (\url{https://perma.cc/4EPQ-FW6Q})} write collaborative fiction. Still others, such as \textit{Uncyclopedia},\footnote{\url{http://uncyclopedia.wikia.com/} (\url{https://perma.cc/L9CC-KN5A})} parody Wikipedia. 
These \Sexpr{n.wikia.wikis} wikis also vary along our measures. For example, quality control practices vary, and the number of reverts within our communities ranges from \Sexpr{f(wiki.stats[,min(total.reverts)])} to \Sexpr{f(wiki.stats[,max(total.reverts)])} (median \Sexpr{f(wiki.stats[,median(total.reverts)])}). Only \Sexpr{format.percent(wiki.stats[,mean(total.bot.reverts!=0)])} use bots to revert edits and the number of bot reverts among wikis with any ranges from \Sexpr{f(wiki.stats[total.bot.reverts!=0,min(total.bot.reverts)])} to \Sexpr{f(wiki.stats[total.bot.reverts!=0,max(total.bot.reverts)])} (median \Sexpr{f(wiki.stats[total.bot.reverts!=0,median(total.bot.reverts)])}). The communities also vary in terms of policy making activity. A ``namespace'' is a high-level category used on all wikis. The project namespace is typically used for policy and documentation and governance activity. The number of edits to the project namespace ranges from \Sexpr{f(wiki.stats[,min(total.ns4.edits)])} to \Sexpr{f(wiki.stats[,max(total.ns4.edits)])} (median \Sexpr{f(wiki.stats[,median(total.ns4.edits)])}). 

\section{Study 1: Trajectories}

\begin{figure}[t]
<<newcomer_survival_reversion,echo=FALSE,fig.height=3.8,fig.width=5,out.width="\\columnwidth",cache=FALSE,message=FALSE,warning=FALSE>>=
# Box plots of per-wiki newcomer survival and rejection rates by wiki age,
# one facet per measure, with mean and median trend lines on top.

# min.wikis.plotted is quoted in the figure caption.
# NOTE(review): neither threshold is applied in this chunk; presumably
# p.stats was already filtered upstream -- confirm.
min.newcomers.plotted <- 1
min.wikis.plotted <- 20

# Relabel the two measures of interest, then drop every other row.
p.stats[variable=='p.survives',variable:="Surviving"]
p.stats[variable=='p.reverted',variable:="Reverted"]

p.stats <- p.stats[variable=='Surviving' | variable=="Reverted"]

# Keep rows up to max.year years of age (age is divided by 2 to get years).
max.year <- 5
p.stats <- p.stats[wiki.age.half.years/2 <= max.year]
# One axis label per year, placed at the half-year value that starts it.
# Both are derived from max.year so they stay in step with the filter above.
xlabels <-  paste0("Year ", 0:max.year)
breaks <- (0:max.year)*2

# Break function factory for the free y scales: a fine grid when the data
# stay below 0.1 and a coarse grid otherwise.
equal_breaks <- function(...){
    function(x){
        if(max(x) < 0.1){
            return(c(0,0.015,0.03,0.045,0.06))
        }
        else{
            return(c(0,0.2,0.4,0.6,0.8))
        }
    }
}

# Final facet labels shown in the figure.
p.stats <- p.stats[variable=='Surviving',variable:="Survived"]
p.stats <- p.stats[variable=='Reverted',variable:="Rejected"]

# The box plot statistics are precomputed columns, hence stat='identity'.
p <- ggplot(p.stats,aes(x=as.factor(wiki.age.half.years),ymin=min,lower=q1,middle=med,upper=q3,ymax=max,width=0.4))
p <- p + geom_boxplot(stat='identity')
# NOTE(review): the "+ 1" presumably shifts the numeric age onto the 1-based
# positions of the factor levels on the discrete x axis -- confirm.
p <- p + geom_line(aes(x=wiki.age.half.years + 1, y=mu),color="#E69F00",linetype=2)
p <- p + geom_line(aes(x=wiki.age.half.years + 1, y=med),color="#CC79A7",linetype=1)
p <- p + facet_wrap("variable",nrow=2,strip.position="right",scales="free_y")
p <- p + scale_y_continuous(name="Proportion of newcomers ",minor_breaks=NULL,breaks=equal_breaks()) + scale_x_discrete(name="Wiki age", labels=xlabels,breaks=breaks)
# "none" is the documented spelling for suppressing the legend.
p <- p + theme_minimal(base_size=12)  + theme(legend.position="none",panel.grid.major.x=element_blank(),panel.spacing=unit(2,'lines'))

print(p)
@
\caption{Newcomer survival and rejection over time. The orange dashed line shows the mean and the pink solid line shows the median. Years with data from at least  \Sexpr{min.wikis.plotted} wikis are shown.}
\label{newcomer-survival}
\end{figure}

We first replicate RAD's Figure 2 to determine whether the ``rise and decline'' pattern generalizes. To compare across wikis of vastly different size, we divide the number of monthly active contributors to each wiki in a given month by the standard deviation of that measure within that wiki.
Figure \ref{plot.editors.time} plots our results and shows a trajectory of growth and decline similar to that of English Wikipedia. While Wikipedia's exponential growth was more explosive and lasted longer, the average active Wikia wiki follows a similar pattern. These communities begin small, tend to grow for 3--4 years, and then transition from growth to decline. 
Because few wikis in our dataset have existed for more than 5 years, we visualize only months with at least \Sexpr{min(plot.active.editors.dt[,N.wikis])} active wikis (the 90th percentile). 
Although the downward trend continues after this threshold, the estimates become noisier. 

Next we replicate RAD's Figures 3 and 4 to visualize the average trajectories in newcomer survival and rejection. Our results are shown in Figure \ref{newcomer-survival}. Lines connect the mean and median rates for all wikis active in each period to show the overall trend. Box plots visualize the variation between wikis.
The top panel of Figure \ref{newcomer-survival} corresponds to RAD's Figure 3 and shows box plots for the proportion of newcomers who survive in each year. As in Wikipedia, newcomer retention declines over time in the average wiki in our dataset. The trend is statistically significant (Spearman's \(\rho=\Sexpr{signif(survives.cor.test$estimate,2)},~p<0.001\)).

The bottom panel of Figure \ref{newcomer-survival} corresponds to RAD's Figure 4 and shows box plots for the proportion of newcomers who are rejected over time. Although rejection is much less common in our wikis than in English Wikipedia, wikis in our sample exhibit increasing rates of newcomer rejection. The trend is statistically significant (Spearman's \(\rho=\Sexpr{signif(reverted.cor.test$estimate,2)},~p<0.001\)). Although our estimates point in the same direction as RAD's, the average trajectory is qualitatively different. Rates of newcomer rejection are initially very low, increase over the first year, and remain level for most of the wiki's lifetime. They begin increasing again in the 4\textsuperscript{th} year, when the number of active editors tends to decline.

\section{Study 2: Newcomer survival}
\subsection{Methods}

We replicate RAD's first logistic regression model predicting whether a newcomer \emph{survived} to test whether being \emph{reverted} or \emph{tool reverted} in the first edit session makes newcomers less likely to survive. RAD includes a single variable capturing the linear effect of time which reflects both the age of Wikipedia and the passage of calendar time. Having multiple wikis, we can tease apart wiki age and calendar time by measuring \emph{wiki age} as the time since the first edit to each wiki in years. We include a linear specification of this variable following RAD. We also add a control for calendar time by adding a categorical variable for \emph{quarter} that includes dummy variables for each 90-day calendar period.
We also include \emph{wiki}, a categorical variable with \Sexpr{f(n.wikia.wikis)} levels to account for variation in baseline level of newcomer retention between wikis and to address issues of serial correlation in our standard errors. We do not report the results for these categorical variables, both for the sake of clarity and because they control for variation in the dataset that does not relate to the core theoretical concerns.

We cannot replicate several facets of this part of RAD's analysis in this study. RAD considers two kinds of newcomer rejection. Although the first of these corresponds with our measure for \emph{reverted}, the RAD authors also consider whether an article created by a newcomer  in their first session is \emph{deleted}. We do not have information about deleted pages. 
Additionally, RAD considers two kinds of algorithmic tools: fully automated ``bots'' and semi-automated editing interfaces that automatically alert human users to suspected vandalism. These interfaces are either very rare or invisible on Wikia, so our measure of \emph{tool reverted} only includes rejection by bots. Summary statistics for our analytic variables are available in the supplementary material.
 
\subsection{Results}

\input{tables/halfak.mod.tex}

Table \ref{table:regression.1} shows our fitted regression model. This table closely mirrors the first column of RAD's Table 1.
Like RAD, we find that newcomers reverted in their first edit session are less likely to survive (\(\beta=\Sexpr{signif(m1.coef[['is.revertedTRUE']],2)}\), \(SE = \Sexpr{signif(m1.se[['is.revertedTRUE']],2)}\)).  The magnitude of our coefficient for \emph{reverted} is very close to that reported by RAD (\(\beta=-0.68\), \( SE=0.04\)). According to our model, a newcomer who is reverted in their first session has \Sexpr{signif(exp(m1.coef[['is.revertedTRUE']]),2)} times the odds of continuing to contribute compared to a newcomer who is not reverted.
We also find a negative relationship between \emph{wiki age} and newcomer survival (\(\beta=\Sexpr{signif(m1.coef[['wiki.age']],2)}\), \(SE=\Sexpr{signif(m1.se[['wiki.age']],2)}\)). Again, this is very close to that reported by RAD (\(\beta=-0.40\), \(SE=0.012\)).
Our parameter estimate for \emph{tool reverted} (\(\beta=\Sexpr{signif(m1.coef[['is.bot.revertedTRUE']],2)}\), \(SE=\Sexpr{signif(m1.se[['is.bot.revertedTRUE']],2)}\)) suggests that newcomers who are rejected by a bot might be less likely to survive. However, the magnitude of this coefficient is too small relative to its standard error to support confidence in this conclusion.\footnote{A post-hoc power analysis suggests that, even if the true relationship is the same as that observed in RAD, we may have been unable to observe it because only \Sexpr{f(sum(newcomer.summary.stats[['p.bot.reverted']]) * halfak.model@gof[5])} newcomers were reverted by bots in our dataset. See the supplementary materials for details.}


\section{Study 3: Entrenchment}
\subsection{Methods}

Finally, we replicate RAD's second model that predicts whether or not an edit to a policy page will be reverted. Although RAD carefully distinguishes between official policy pages and essays, Wikia contributors do not systematically label policy pages in this way. Therefore, we follow Shaw and Hill \cite{shaw_laboratories_2014} and analyze all edits to the project namespace. This departure presents a substantial threat to validity, as the project namespace may be used for purposes besides documenting norms. Despite this limitation, we believe this measure provides the best available opportunity to study norm entrenchment in Wikia. Not all of the wikis in our sample utilize the namespace, so we use only the subset of \Sexpr{n.wikis.ns4} wikis that do for this analysis. Summary statistics for our analytic variables are available in the supplementary material. 

\subsection{Results}
\input{tables/morgan.model.tex}

Table \ref{table.regression.2} shows the fitted model results and replicates RAD's Table 2.
Like RAD, we find that contributors with greater \emph{editor tenure} are less likely to have their edits to policy pages reverted (\(\beta=\Sexpr{signif(m2.coef[['age']],2)}\), \(SE=\Sexpr{signif(m2.se[['age']],2)}\)). Our model predicts that, everything else equal, an editor with a 1-week tenure faces about \Sexpr{signif(exp(51/52*-1*m2.coef[['age']]),2)} times the odds of having their edit reverted compared to an editor with a 1-year tenure. RAD reported an odds ratio of  \Sexpr{signif(exp(51/52*0.29),2)}. 
Also consistent with RAD, we find that project page edits become more likely to be reverted as \emph{wiki age} increases (\(\beta=\Sexpr{signif(m2.coef[['wiki.age']],2)}\), \(SE=\Sexpr{signif(m2.se[['wiki.age']],2)}\)).  According to our model, an edit to the project namespace on a wiki that is 1 year old has about \Sexpr{signif(exp(51/52*m2.coef[['wiki.age']]),2)} times the odds of rejection as when the wiki is 1 week old. 

\section{Discussion}

We find that the patterns of community entrenchment documented in English Wikipedia also occur in comparable Wikia wikis. Wikis in our dataset experience growth in active contributors over about three years and then decline. Newcomer survival tends to decline over time, and newcomers who are rejected are less likely to survive. Older editors have more influence over norms, and norms become more difficult to change. 

By studying these dynamics outside Wikipedia, we can rule out potential explanations of RAD's results linked to unique characteristics of Wikipedia, such as its specific culture. We can also rule out explanations linked to the specific time at which English Wikipedia experienced its decline. The diversity and size of our sample support both more precise estimation of the observed relationships in the data and stronger confidence in the validity of our inferences.

Our work has important limitations. Our data are observational, our sample may have unknown biases, and our measures may contain hidden sources of error. For example, wiki editors may change accounts, bots may be unreported, and the project namespace may include material unrelated to norms. Omitted variables may also bias our results. Readers should be careful not to draw causal conclusions from our findings.  

The units of analysis in our regression models are newcomers and edits to project namespaces. Because the wikis in our sample have different numbers of each, the average effects we report could disproportionately reflect the experience of users in the communities that contribute the most observations to our sample. As a robustness check, we fit another set of regression models where each wiki is given equal weight. Our substantive conclusions are robust to this change. Indeed, the re-weighted models suggest that the relationships reported in RAD may even be stronger in smaller or less active communities. In one unsurprising exception, we find that norm pages do not appear to become more difficult to edit over time in wikis that make very little use of the project namespace. These preliminary findings suggest analysis of the relationship between the size of a community and governance systems as a promising direction for future work. Details are available in the supplementary material. 

Despite our effort at generalization, we cannot know if our findings will generalize beyond the wikis in our sample. That said, we think the mechanisms driving the emergence of entrenchment on wikis are similar to mechanisms theorized to drive the emergence and centralization of authority in democratic organizations. For example, Michels' ``iron law of oligarchy'' predicts that bureaucracies arising in large democratic organizations will centralize authority \cite{michels_political_1915, shaw_laboratories_2014}. Similarly, Freeman's ``The Tyranny of Structurelessness'' describes how, even when activist groups deliberately avoid creating formalized rules and bureaucracies, informal structures arise \cite{freeman_tyranny_1972}. Indeed, due to their opacity, informal structures can be more difficult for newcomers to navigate than formalized bureaucracies. Drawing from this earlier theoretical work and our own results, we believe that the patterns of increasing entrenchment and newcomer rejection we estimate will generalize beyond the wikis in our sample to other peer production projects and informal organizations. Understanding why some communities in our sample show more entrenchment than others remains a fascinating subject for further research.

\section{Conclusion}

Our study supports RAD's claim that quality control practices help explain increases in entrenchment and decreases in growth among peer production communities. Our work contributes to social computing and peer production research by providing evidence in support of the external validity of RAD, an influential empirical study. We also contribute to a small but growing literature on replication in HCI by demonstrating a replication study focused on generalizability. Our evidence in support of generalizability rests not only on the signs of our regression coefficients, but also on the similarity of our point estimates and visualizations. This work supports designers and community managers who are acting on the implications in RAD's earlier work.

\section{Acknowledgments}

The authors would like to thank our anonymous reviewers and associate chairs at CHI for their thoughtful and detailed feedback. We would also like to thank Wikia for providing public access to data from their wikis and other members of the Community Data Science Collective for sharing the software, data, and research infrastructure necessary to complete this work. We thank Jonathan Morgan for providing help in planning this study and Amanda TeBlunthuis, Kaylea Champion, Wm Salt Hale, and Sayamindu Dasgupta for feedback on drafts of our manuscript. This project was completed using the Hyak high performance computing cluster at the University of Washington. Financial support for this work came from the National Science Foundation (grants IIS-1617129, IIS-1617468, and GRFP-2016220885), Northwestern University, and the University of Washington. 

\section{Access to Data}

A replication dataset has been placed in the Harvard Dataverse archive and is available at the following URL: \\ \href{https://doi.org/10.7910/DVN/SG3LP1}{https://doi.org/10.7910/DVN/SG3LP1}.

% REFERENCES FORMAT
\bibliographystyle{SIGCHI-Reference-Format}
\bibliography{refs}

\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:

%  LocalWords:  xlabels xbreaks aes loess linetype rebranded RAD's
%  LocalWords:  replicable Hornæk Kittur sociotechnical crowdsourced
%  LocalWords:  BRD Althistory Uncyclopedia namespace ymin ymax Hyak
%  LocalWords:  boxplot Spearman's Structurelessness Kaylea IIS
%  LocalWords:  Dataverse