code.communitydata.science - rises_declines_wikia_code.git/commitdiff
Initial commit
author groceryheist <nathante@uw.edu>
Sat, 2 Jun 2018 22:32:19 +0000 (15:32 -0700)
committer groceryheist <nathante@uw.edu>
Sat, 2 Jun 2018 22:32:19 +0000 (15:32 -0700)
# new file:   runwikiq.sh

202 files changed:
00_count_editors.R [new file with mode: 0755]
00_select_wikis.R [new file with mode: 0755]
01_build_datasets.R [new file with mode: 0755]
02_model_newcomer_survival.R [new file with mode: 0755]
03_generate_plots.R [new file with mode: 0755]
04_model_namespace4.R [new file with mode: 0755]
05_power_simulation.R [new file with mode: 0755]
RCommunityData/.Rbuildignore [new file with mode: 0644]
RCommunityData/.gitignore [new file with mode: 0644]
RCommunityData/DESCRIPTION [new file with mode: 0644]
RCommunityData/NAMESPACE [new file with mode: 0644]
RCommunityData/R/hhi.R [new file with mode: 0644]
RCommunityData/R/load_if_missing.R [new file with mode: 0644]
RCommunityData/R/namespaces.R [new file with mode: 0644]
RCommunityData/R/wikia_admin.R [new file with mode: 0644]
RCommunityData/R/wikiq.R [new file with mode: 0644]
RCommunityData/RCommunityData.Rproj [new file with mode: 0644]
README.md [new file with mode: 0644]
lib-00-utils.R [new file with mode: 0644]
lib-01-build_newcomer_table.R [new file with mode: 0644]
lib-01-generate_userroles.R [new file with mode: 0644]
lib-01-sample-datasets.R [new file with mode: 0644]
mediawiki_dump_tools/.gitignore [new file with mode: 0644]
mediawiki_dump_tools/.gitmodules [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/.gitignore [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/CHANGE_LOG.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/LICENSE [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/MANIFEST.in [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/README.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/WORK_LOG.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/debian/changelog [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/debian/compat [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/debian/control [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/debian/copyright [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/debian/rules [new file with mode: 0755]
mediawiki_dump_tools/Mediawiki-Utilities/doc/Makefile [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/_static/PLACEHOLDER [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/_templates/PLACEHOLDER [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/conf.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/core/api.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/core/database.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/core/xml_dump.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/index.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/persistence.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/reverts.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/sessions.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/title.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/doc/types.rst [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/api.deleted_revisions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/api.recent_changes.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/api.revisions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/api.users.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/database.users.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/dump.xml [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/dump2.xml [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.persistence.api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.database.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.sessions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.title.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/timestamp.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.iteration.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.map.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/collection.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/deleted_revisions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/pages.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/recent_changes.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/revisions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/site_info.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/user_contribs.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/users.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/errors.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/api/session.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/collection.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/pages.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/recent_changes.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/revisions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/users.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/database/db.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/defaults.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/difference.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/state.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_difference.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_state.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokenization.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokens.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokenization.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokens.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/database.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/defaults.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/detector.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/dummy_checksum.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_detector.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/cache.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/defaults.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/event.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_cache.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/parser.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_parser.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/namespace.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/serializable.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_namespace.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_serializable.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_timestamp.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/types/timestamp.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/api.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/autovivifying.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/heap.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/aggregate.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/count.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/peekable.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/sequence.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_aggregate.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_peekable.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_sequence.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/ordered.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_autovivifying.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_heap.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_ordered.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/7zfile.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/element_iterator.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/errors.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/comment.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/contributor.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/iterator.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/namespace.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/page.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/redirect.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/revision.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_comment.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_iterator.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_text.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/text.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/util.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/map.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/processor.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/__init__.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_element_iterator.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_functions.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_map.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_processor.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/setup.py [new file with mode: 0644]
mediawiki_dump_tools/Mediawiki-Utilities/tox.ini [new file with mode: 0644]
mediawiki_dump_tools/README.rst [new file with mode: 0644]
mediawiki_dump_tools/mw [new symlink]
mediawiki_dump_tools/wikiq [new file with mode: 0755]
paper_source/ACM-Reference-Format.bst [new file with mode: 0644]
paper_source/Makefile [new file with mode: 0644]
paper_source/SIGCHI-Reference-Format.bst [new file with mode: 0644]
paper_source/acmcopyright.sty [new file with mode: 0644]
paper_source/auto/generalizable_wiki.el [new file with mode: 0644]
paper_source/figure/newcomer_survival_reversion-1.pdf [new file with mode: 0644]
paper_source/figure/plot-editors-1.pdf [new file with mode: 0644]
paper_source/generalizable_wiki.Rtex [new file with mode: 0644]
paper_source/knitr/lib-01-generate_userroles.RDS [new file with mode: 0644]
paper_source/knitr/remember.RDS [new file with mode: 0644]
paper_source/refs.bib [new file with mode: 0644]
paper_source/sigchi.cls [new file with mode: 0644]
paper_source/tables/halfak.mod.tex [new file with mode: 0644]
paper_source/tables/morgan.model.tex [new file with mode: 0644]
paper_source/todo.txt [new file with mode: 0644]
regen.all.sh [new file with mode: 0755]
runwikiq.sh [new file with mode: 0755]
userroles_scraper_scripts/list_of_wikis.csv [new file with mode: 0644]
userroles_scraper_scripts/userroles_from_listusers.py [new file with mode: 0755]
userroles_scraper_scripts/userroles_from_logevents.py [new file with mode: 0755]

diff --git a/00_count_editors.R b/00_count_editors.R
new file mode 100755 (executable)
index 0000000..8d7f194
--- /dev/null
@@ -0,0 +1,28 @@
+#!/usr/bin/env Rscript
+## Script that saves the number of unique editors of a wiki. For use with parallelsql.
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+source("lib-00-utils.R")
+opt <- commandArgs(trailingOnly = TRUE)
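+## opt[1]: filename of the wikiq TSV to process (supplied by parallelsql)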
+input.dir <- "../wikiq_wikia_2010_all_nopersistance/"
+output.dir <- "../wikiq_wikia_2010_unique_editors/"
+d <- load.wikiq.file(paste0(input.dir,opt[1]))
+
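+## count distinct registered (non-anonymous) editors of article-namespace (ns 0) edits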
+n.editors <- length(unique(d[anon == FALSE & namespace == 0, editor.id]))
+
+write(n.editors,file=paste0(output.dir,gsub("\\.tsv$",'.editors',opt[1])),append=FALSE)
diff --git a/00_select_wikis.R b/00_select_wikis.R
new file mode 100755 (executable)
index 0000000..de8849c
--- /dev/null
@@ -0,0 +1,44 @@
+#!/usr/bin/env Rscript
+
+## Script used to choose the top 1% of wikis to analyze
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library("ggplot2")
+library("data.table")
+counts.dir <- "../wikiq_wikia_2010_unique_editors/"
+files <- list.files(counts.dir)
+read.count.file <- function(f){
+    return(read.csv(paste0(counts.dir,f),header=FALSE))
+}
+dbname <- gsub("\\.editors",'',files)
+counts <- c(sapply(files,read.count.file))
+counts <- unlist(counts,use.names=FALSE)
+dt <- data.table(wiki=dbname,n.editors=counts)
+
+
+#ggplot(dt,aes(x=n.editors)) + stat_ecdf(geom="step") + scale_x_log10(minor_breaks=10**(1:10/2)) + scale_y_continuous(minor_breaks=1:20/20)
+
+top_1_percentile = quantile(x=dt$n.editors,probs=(1:99)/100)[99]
+## lets take all with > 100. This is very close to the top 1%, but it involves nice round numbers :)
+
+wiki.list <- dt[n.editors >= top_1_percentile]
+
+## the table built above has no url column yet, so construct one for every wiki
+wiki.list[,':='(url=paste0("http://",wiki,".wikia.com/"))]
+wiki.list$wiki.type="wikia"
+
+fwrite(wiki.list,"selected.wikis.csv")
+
diff --git a/01_build_datasets.R b/01_build_datasets.R
new file mode 100755 (executable)
index 0000000..89ab813
--- /dev/null
@@ -0,0 +1,77 @@
+#!/usr/bin/env Rscript
+# Top level script for building datasets. 
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(data.table)
+library(parallel)
+
+plot.distribution <- function(data,varname,save=TRUE){
+    x = data[[varname]];
+    print(paste("plotting distribution for",varname))
+    if(save){
+        pdf(paste0("plots/",varname,".distribution.pdf"))
+    }
+    ## overlay histogram, empirical density and normal density
+    if(class(x) == "logical"){
+        p0 <- qplot(x)
+    }
+    else{
+        p0 <- qplot(x, geom = 'blank') +
+            geom_line(aes(y = ..density.., colour = 'Empirical'), stat = 'density') +
+            geom_histogram(aes(y = ..density..), alpha = 0.4, bins = 100) +
+            scale_colour_manual(name = 'Density', values = c('red', 'blue')) +
+            theme(legend.position = c(0.85, 0.85))
+    }
+    print(p0)
+    if(save){
+        dev.off()
+    }
+}
+
+if(!exists("wiki.list")){
+    source("lib-00-utils.R",echo=TRUE)
+}
+
+if(!exists("bots") | !exists("admins")){
+    if(file.exists("bots.RDS") & file.exists("admins.RDS")){
+        bots = readRDS("bots.RDS")
+        admins = readRDS("admins.RDS")
+    }
+    else {
+        source("lib-01-generate_userroles.R",echo=TRUE)
+    }
+}
+
+if(!exists("newcomer.dt")){
+    intermediate.files <- list("newcomers.RDS","wikiweeks.RDS","wiki.stats.RDS","active.editors.RDS")
+    if(! all(sapply(intermediate.files,function (x) file.exists(x)))){
+        source("lib-01-build_newcomer_table.R",echo=TRUE)
+    }
+}
+
+plot.distributions = FALSE
+if(plot.distributions == TRUE){
+    library(ggplot2)
+    ## plot distributions for model 1
+    outcome1 <- c("survives")
+    predictors1 <- c("is.reverted","is.messaged","is.bot.reverted","is.reverted.messaged","is.admin.reverted","BRD.initiation","BRD.reciprocation")
+    controls1 <- c("ns0.edits","ns1.edits","ns4.edits","n.other.wikis","week","has.edited.other.wikis","n.edits.other","n.messages","n.editors","total.wiki.length","revert.rate","revert.disc.rate","newcomer.revert.disc.rate","revert.message.rate","newcomer.revert.message.rate","newcomer.edits.rate","bot.revert.rate","bot.revert.prop","newcomer.bot.revert.rate","newcomer.bot.revert.prop","admin.revert.rate","admin.revert.prop","n.ns4.edits","n.ns4.editors","d.ns4.length","ns4.editor.age","age","wiki.age")
+
+    for(varname in c(outcome1,predictors1,controls1)){
+        plot.distribution(newcomers,varname)
+    }
+}
diff --git a/02_model_newcomer_survival.R b/02_model_newcomer_survival.R
new file mode 100755 (executable)
index 0000000..87ecdca
--- /dev/null
@@ -0,0 +1,162 @@
+#!/usr/bin/env Rscript
+
+# Fits newcomer retention models 
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(scales)
+if(!exists("newcomers")){
+    source("01_build_datasets.R")
+}
+
+use.sample <- FALSE
+if(use.sample == TRUE){
+    source("lib-01-sample-datasets.R")
+    newcomer.ds <- sample.newcomers()
+}else{
+    newcomer.ds <- newcomers    
+}
+
+library("optimx")
+library("lme4")
+
+newcomer.ds <- newcomer.ds[,wiki:=as.factor(wiki.name)]
+
+newcomer.ds <- newcomer.ds[,":="(
+    wiki.age.log = log1p(as.double(wiki.age,units='weeks')),
+    is.bot.reverted = ifelse(is.na(is.bot.reverted),FALSE,is.bot.reverted),
+    is.admin.reverted = ifelse(is.na(is.admin.reverted),FALSE,is.admin.reverted),
+    year = as.factor(year(time.first.edit)),
+    month = as.factor(paste0(year(time.first.edit),month(time.first.edit))),
+    ns0.edits.log = log1p(ns0.edits),
+    ns1.edits.log = log1p(ns1.edits),
+    ns4.edits.log = log1p(ns4.edits),
+    n.other.wikis.log = log1p(n.other.wikis),
+    n.edits.other.log = log1p(n.edits.other),
+    n.messages.log = log1p(n.messages),
+    n.editors.log = log1p(n.editors),
+    total.wiki.length.log = log1p(total.wiki.length),
+    n.ns4.edits.log = log1p(n.ns4.edits),
+    n.ns4.editors.log = log1p(n.ns4.editors),
+    ns4.editor.age.log = log1p(as.double(ns4.editor.age,units='years')),
+    d.ns4.length.scaled = scale(d.ns4.length),
+    newcomer.chars.changed.scaled = scale(newcomer.chars.change),
+    session.edits.log = log1p(session.edits),
+    wiki.age = as.double(wiki.age,units='years')
+)]
+
+## record summary stats for our analytic variables
+newcomer.summary.stats <- list()
+newcomer.summary.stats$p.survives <- mean(newcomer.ds$survives)
+newcomer.summary.stats$var.survives <- var(newcomer.ds$survives)
+
+outliers <- newcomer.ds[session.edits >= 100]
+newcomer.summary.stats$N.outliers <- nrow(outliers)
+newcomer.summary.stats$p.first.session.no.outliers <- mean(newcomer.ds[session.edits < 100]$session.edits)
+newcomer.summary.stats$var.first.session.no.outliers <- var(newcomer.ds[session.edits < 100]$session.edits)
+
+newcomer.summary.stats$p.reverted <- mean(newcomer.ds$is.reverted)
+newcomer.summary.stats$var.reverted <- var(newcomer.ds$is.reverted)
+newcomer.summary.stats$p.messaged <- mean(newcomer.ds$is.messaged)
+newcomer.summary.stats$var.messaged <- var(newcomer.ds$is.messaged)
+newcomer.summary.stats$mean.first.session.edits <- mean(newcomer.ds$session.edits)
+newcomer.summary.stats$var.first.session.edits <- var(newcomer.ds$session.edits)
+newcomer.summary.stats$med.first.session.edits <- median(newcomer.ds$session.edits)
+newcomer.summary.stats$p.bot.reverted <- mean(newcomer.ds$is.bot.reverted)
+newcomer.summary.stats$var.bot.reverted <- var(newcomer.ds$is.bot.reverted)
+remember(newcomer.summary.stats)
+
+halfak.formula <-  as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki.age + quarter + wiki.name")
+
+newcomer.ds.all <- newcomer.ds
+newcomer.ds <- newcomer.ds[n.other.wikis==0]
+
+print('fitting halfak model on all newcomers')
+halfak.mod.all.newcomers <- glm(halfak.formula,data=newcomer.ds.all,family=binomial(link=logit))
+saveRDS(halfak.mod.all.newcomers,"halfak.mod.all.newcomers.RDS")
+remember(extract(halfak.mod.all.newcomers),"halfak.model.all.newcomers",silent=TRUE)
+
+print("fitting halfak model")
+halfak.mod <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit))
+saveRDS(halfak.mod,"halfak.mod.RDS")
+remember(extract(halfak.mod),"halfak.model",silent=TRUE)
+
+print('fitting halfak model with weights')
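+## reweight so every wiki contributes equal total weight, keeping large wikis from dominating the fit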
+n.total.wikis <- length(unique(newcomer.ds$wiki.name))
+weight.per.wiki <- nrow(newcomer.ds)/n.total.wikis
+newcomer.ds <- newcomer.ds[,weights:=weight.per.wiki/.N,by=wiki.name]
+halfak.mod.weighted <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit),weights=newcomer.ds$weights)
+saveRDS(halfak.mod.weighted,"halfak.mod.weighted.RDS")
+remember(extract(halfak.mod.weighted),"halfak.model.weighted",silent=TRUE)
+
+## print('fit halfak model on a sample')
+## sample.size <- 30
+## newcomer.ds <- newcomer.ds[,in.sample:=.N >= sample.size, by=wiki.name]
+## newcomer.ds.sample <- newcomer.ds[,.SD[sample(.N,min(sample.size,.N))],by=wiki.name]
+## halfak.mod.sample <- glm(halfak.formula,data=newcomer.ds.sample,family=binomial(link=logit))
+## saveRDS(halfak.mod.sample,"halfak.mod.sample.RDS")
+## remember(extract(halfak.mod.sample),"halfak.model.sample",silent=TRUE)
+
+print('fitting RE model')
+library("optimx")
+print('fitting re model')
+re.icc.survives.model <- glmer(as.formula("survives ~ + (1 | wiki) - 1"),data=newcomer.ds,family=binomial(link=logit))
+saveRDS(re.icc.survives.model,"re.icc.survives.model.RDS")
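+## intraclass correlation: wiki-level intercept variance over total (wiki + residual) variance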
+varcorrmat <- as.data.table(VarCorr(re.icc.survives.model))
+wiki.var <- varcorrmat[grp=='wiki' & var1=="(Intercept)" ,vcov]
+group.var <- var(residuals(re.icc.survives.model))
+icc <- wiki.var/(group.var + wiki.var)
+remember(varcorrmat,'icc.survives.varcormat')
+remember(group.var,'icc.survives.group.var')
+remember(icc,'icc.survives')
+
+## newcomer.no.pooling.f <- as.formula("survives ~ is.reverted:wiki.name + is.messaged:wiki.name + is.bot.reverted:wiki.name + session.edits.log:wiki.name + wiki.name + quarter:wiki.name + wiki.name:wiki.age - 1")
+## newcomer.no.pooling.mod <- glm(newcomer.no.pooling.f,data=newcomer.ds,family=binomial(link=logit))
+## remember(extract(newcomer.no.pooling.mod),"newcomer.no.pooling.mod",silent=TRUE)
+
+## if( !(exists("halfak.robustnes1.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
+##         halfak.robustness1.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
+##         print("fitting halfak robustness 1 model")
+##         newcomer.robustness.ds <- newcomer.ds[p.reverted <= 0.05]
+##         halfak.robustness1.mod <- glm(halfak.robustness1.formula,data=newcomer.robustness.ds,family=binomial(link=logit))
+##         saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
+##         remember(extract(halfak.robustness1.mod),"halfak.robustness1.model")
+##     }
+##     else if(file.exists("halfak.robustness1.mod.RDS") & !exists("halfak.robustness1.mod")){
+##         halfak.robustness1.mod <- readRDS("halfak.robustness1.mod.RDS")
+##     }
+##     else if (exists("halfak.robustness1.mod")){
+##         saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
+##     }
+##         remember(extract(halfak.robustness1.mod),"halfak.robustness1.mod")
+## }
+
+## if( !(exists("halfak.robustnes2.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
+##         halfak.robustness2.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits .log + wiki + quarter + wiki:wiki.age")
+##         print("fitting halfak robustness 2 model")
+##         newcomer.robustness.ds2 <- newcomer.ds[p.reverted <= 0.5]
+##         halfak.robustness2.mod <- glm(halfak.robustness2.formula,data=newcomer.robustness.ds2,family=binomial(link=logit))
+##         saveRDS(halfak.robustness1.mod,"halfak.robustness2.mod.RDS")
+##         remember(extract(halfak.robustness1.mod),"halfak.robustness2.model")
+##     }
+##     else if(file.exists("halfak.robustness2.mod.RDS") & !exists("halfak.robustness2.mod")){
+##         halfak.robustness2.mod <- readRDS("halfak.robustness2.mod.RDS")
+##     }
+##     else if (exists("halfak.robustness2.mod")){
+##         saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
+##     }
+##         remember(extract(halfak.robustness2.mod),"halfak.robustness2.mod")
+## }
diff --git a/03_generate_plots.R b/03_generate_plots.R
new file mode 100755 (executable)
index 0000000..250a4a9
--- /dev/null
@@ -0,0 +1,97 @@
+#!/usr/bin/env Rscript
+
+# Creates data for plotting
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library("ggplot2")
+library("bootstrap")
+
+library("scales")
+source("lib-00-utils.R")
+if(!exists("newcomers")){
+    source("01_build_datasets.R")
+}
+
+remember(min(all.edits$date.time),"earliest.data.point")
+remember(max(all.edits$date.time),"latest.data.point")
+
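+## per-wiki rates of newcomer reversion and survival within half-year bins of wiki age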
+p1.data <- newcomers[,.(p.reverted = mean(is.reverted),
+                          var.reverted=var(is.reverted),
+                          p.survives=mean(survives),
+                          var.survives=(var(survives)),
+                          N=.N),
+                       by=.(wiki.name,wiki.age.half.years)]
+p1.data <- p1.data[N>1]
+p1.data[,N.wikis := .N, by = .(wiki.age.half.years)]
+## put p1 data onto sd scales
+p1.data[,p.survives.in.sd := p.survives/sd(p.survives),by=.(wiki.name)]
+p1.data[,p.reverted.in.sd := p.reverted/sd(p.reverted),by=.(wiki.name)]
+
+p.data <- melt(p1.data,id.vars=c("wiki.name","wiki.age.half.years"),measure.vars=c("p.survives","p.reverted","p.survives.in.sd","p.reverted.in.sd"))
+
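+## five-number (boxplot) summary of each rate by half-year of wiki age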
+p.stats <- p.data[,as.list(c(setNames(boxplot.stats(value,coef=1.5)$stats,c("min","q1","med","q3","max")),
+                             mu=mean(value),N.wikis=.N)),by=.(wiki.age.half.years,variable)]
+
+remember(p.stats)
+p.stats[variable=="p.survives"]$variable="Survives"
+p.stats[variable=="p.reverted"]$variable="Reverted"
+
+remember(cor.test(p1.data$wiki.age.half.years,p1.data$p.survives,method='spearman',alternative='less'),"survives.cor.test")
+remember(cor.test(p1.data$wiki.age.half.years,p1.data$p.reverted,method='spearman',alternative='greater'),"reverted.cor.test")
+
+xlabels = paste0("Year ", 0:max(p.stats$wiki.age.half.years))
+p <- ggplot(p.stats,aes(x=as.factor(wiki.age.half.years),ymin=min,lower=q1,middle=med,upper=q3,ymax=max,width=0.3))
+p <- p + geom_boxplot(stat='identity')
+p <- p + geom_line(aes(x=wiki.age.half.years+1,y=med), linetype=2)
+p <- p + facet_wrap("variable",nrow=2,strip.position="bottom",scales="free")
+p <- p + scale_y_continuous(name="Proportion of newcomers",minor_breaks=NULL) + scale_x_discrete(name="Wiki age", labels=xlabels)
+p <- p + theme_bw()  + theme(legend.position="None")
+
+pdf(width=6,height=6)
+print(p)
+dev.off()
+
+active.editors <- all.edits[,
+                            .(N.edits=.N,
+                              wiki.age.years=first(wiki.age.years)),
+                            by=.(wiki.name,
+                                 editor,
+                                 wiki.age.months)]
+
+n.active.editors <- active.editors[N.edits >= 5,
+                                   .(N.active.editors = .N,
+                                     wiki.age.years=first(wiki.age.years)),
+                                   by=.(wiki.name,wiki.age.months)]
+
+n.active.editors[, ":="(N=.N), by=.(wiki.age.months)]
+
+n.active.editors[,":="(max.age=max(wiki.age.months),max.active.editors=max(N.active.editors),sd.units.active.editors=N.active.editors/sd(N.active.editors)),by="wiki.name"]
+n.active.editors[,":="(active.editors.pmax=N.active.editors/max.active.editors)]
+wiki.age.quantile <- .90
+
+max.age.months <- quantile(n.active.editors$max.age,wiki.age.quantile)
+
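+## bootstrap the mean (in sd units) of active-editor counts within each month of wiki age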
+boot <- n.active.editors[is.finite(sd.units.active.editors)&wiki.age.months <= max.age.months,.(thetastar = bootstrap(x=sd.units.active.editors,nboot=5000,mean)$thetastar),by=.(wiki.age.months)]
+
+boot.ci <- boot[,as.list(quantile(thetastar,probs=c(0.025,0.975))),by=.(wiki.age.months)]
+names(boot.ci) <- c("wiki.age.months","lower.ci","upper.ci")
+
+plot2.data <- n.active.editors[is.finite(sd.units.active.editors) & wiki.age.months <= max.age.months,.(sd.units.active.editors = mean(sd.units.active.editors),N.active.editors = mean(N.active.editors),wiki.age.years=first(wiki.age.years),N.wikis=.N),by=.(wiki.age.months)]
+
+plot2.data[boot.ci,":="(lower.ci=lower.ci,upper.ci=upper.ci),on="wiki.age.months"]
+
+remember(plot2.data,'plot.active.editors.dt')
diff --git a/04_model_namespace4.R b/04_model_namespace4.R
new file mode 100755 (executable)
index 0000000..2e72eeb
--- /dev/null
@@ -0,0 +1,149 @@
+#!/usr/bin/env Rscript
+
+# Fits models predicting reversions of namespace 4 edits
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(effects)
+library(texreg)
+library(lme4)
+if(!exists("newcomers")){
+    source("01_build_datasets.R")
+}
+nosave <- FALSE
+sample <- FALSE
+
+if(sample == TRUE){
+    source("lib-01-sample-datasets.R")
+    ns4.ds <- sample.ns4.edits()
+    weights <- ns4.ds$weight
+}else{
+    ns4.ds <- ns4.reg.edits
+}
+
+
+ns4.ds <- ns4.ds[,":="(wiki.age.log = log1p(as.double(wiki.age,units="years")),
+             age.log = log1p(as.double(age,units="years")),
+             wiki.age = as.double(wiki.age,units='years'),
+             quarter = as.factor(paste0(year(date.time),"_",ceiling(month(date.time)/4))),
+             age = as.double(age,units='years'))]
+
+ns4.ds <- ns4.ds[,":="(time.first.wikia.edit = min(time.first.edit)),by=.(editor)]
+ns4.ds.all.newcomers <- ns4.ds
+ns4.ds <- ns4.ds[time.first.wikia.edit == time.first.edit]
+
+ns4.summary.stats <- list()
+ns4.summary.stats$p.reverted <- mean(ns4.ds$reverted)
+ns4.summary.stats$var.reverted <- var(ns4.ds$reverted)
+ns4.summary.stats$mean.editor.age <- mean(ns4.ds$age)
+ns4.summary.stats$var.editor.age <- var(ns4.ds$age)
+ns4.summary.stats$median.editor.age <- median(ns4.ds$age)
+ns4.summary.stats$mean.wiki.age <- mean(ns4.ds$wiki.age)
+ns4.summary.stats$var.wiki.age <- var(ns4.ds$wiki.age)
+ns4.summary.stats$median.wiki.age <- median(ns4.ds$wiki.age)
+
+remember(ns4.summary.stats)
+
+print('fit morgan model')
+f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
+morgan.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'))
+saveRDS(morgan.model,"morgan.model.RDS")
+remember(extract(morgan.model),"morgan.model",silent=TRUE)
+
+print('fit morgan model weights')
+f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
+
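+## as in 02_model_newcomer_survival.R: give every wiki equal total weight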
+n.total.wikis <- length(unique(ns4.ds$wiki.name))
+weight.per.wiki <- nrow(ns4.ds)/n.total.wikis
+ns4.ds <- ns4.ds[,weights := weight.per.wiki/.N, by=.(wiki.name)]
+morgan.model.weighted <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=ns4.ds$weights)
+saveRDS(morgan.model.weighted,"morgan.model.weighted.RDS")
+remember(extract(morgan.model.weighted),"morgan.model.weighted",silent=TRUE)
+
+print('fit morgan model weights')
+f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
+ns4.ds <- ns4.ds[,N:=.N,by=wiki.name]
+ns4.ds.temp <- ns4.ds
+min.edits <- 10
+remember(print(1 - length(unique(ns4.ds[N>=min.edits]$wiki.name))/length(unique(ns4.ds$wiki.name))),"p.wikis.removed.weighted2")
+# remove the bottom 24.1% of wikis
+ns4.ds <- ns4.ds[N>=min.edits]
+n.total.wikis <- length(unique(ns4.ds$wiki.name))
+weight.per.wiki <- nrow(ns4.ds)/n.total.wikis
+ns4.ds <- ns4.ds[,weights := weight.per.wiki/.N, by=.(wiki.name)]
+morgan.model.weighted2 <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=ns4.ds$weights)
+saveRDS(morgan.model.weighted2,"morgan.model.weighted2.RDS")
+remember(extract(morgan.model.weighted2),"morgan.model.weighted2",silent=TRUE)
+ns4.ds <- ns4.ds.temp
+
+print('fit morgan model all newcomers')
+morgan.model.all.newcomers <- glm(f.morgan,data=ns4.ds.all.newcomers,family=binomial(link='logit'))
+saveRDS(morgan.model.all.newcomers,"morgan.model.all.newcomers.RDS")
+remember(extract(morgan.model.all.newcomers),"morgan.model.all.newcomers",silent=TRUE)
+
+print('fitting RE model')
+
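+## same ICC decomposition as in 02_model_newcomer_survival.R, here for reversion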
+re.icc.reverted.model <- glmer(as.formula("reverted ~ + (1 | wiki.name) -1 "),data=ns4.ds,family=binomial(link=logit))
+saveRDS(re.icc.reverted.model,"re.icc.reverted.model.RDS")
+varcorrmat <- as.data.table(VarCorr(re.icc.reverted.model))
+wiki.var <- varcorrmat[grp=='wiki.name' & var1=="(Intercept)" ,vcov]
+group.var <- var(residuals(re.icc.reverted.model))
+icc <- wiki.var/(group.var + wiki.var)
+remember(varcorrmat,'icc.reverted.varcorrmat')
+remember(group.var,'icc.reverted.group.var')
+remember(icc,'icc.reverted')    
+
+## print("fit morgan model sample")
+## sample.size <- 30
+## ns4.ds <- ns4.ds[,in.sample:=(.N >= sample.size),by=wiki.name]
+## # DT[,.SD[sample(.N, min(3,.N))],by = a]
+## ns4.ds.equal.sample <- ns4.ds[,.SD[sample(.N,min(sample.size,.N))], by=wiki.name]
+## morgan.model.sampled <- glm(f.morgan,data=ns4.ds.equal.sample,family=binomial(link='logit'))
+## saveRDS(morgan.model.sampled,"morgan.model.sampled.RDS")
+## remember(extract(morgan.model.sampled),"morgan.model.sampled",silent=TRUE)
+
+## ns4.model2.formula <- as.formula("reverted ~ age.log + wiki.age + quarter")
+## ns4.model2 <- glm(ns4.model2,data=ns4.ds,family=binomial(link='logit'),weights=weights)
+## remember(extract(ns4.model2),"ns4.model2")
+
+## print('fit morgan no pooling model')
+## f.morgan <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name:age.log + wiki.name:wiki.age")
+## morgan.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
+## remember(extract(morgan.model),"morgan.model")
+
+## re.ns4.model <- glmer(as.formula("reverted ~ age.log + wiki.age + quarter | wiki.name"),data=ns4.ds,family=binomial(link='logit'),weights=weights)
+
+## remember(extract(re.ns4.model),'re.ns4.model')
+
+## print('fit morgan.robustness.1 model')
+## f.morgan.robustness.1 <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name")
+## ns4.reg.edits.robustness <- build.namespace4.dataset(all.edits[p.reverted < 0.5])
+
+## ns4.reg.edits.robustness[,":="(wiki.age.log = log1p(as.double(wiki.age,units="weeks")),
+##                                age.log = log1p(as.double(age,units="weeks")),
+##                                wiki.age = as.double(wiki.age,units='weeks'),
+##                                quarter = as.factor(paste0(year(date.time),"_",ceiling(month(date.time)/4))))]
+
+## morgan.robustness.1.model <- glm(f.morgan.robustness.1,data=ns4.reg.edits.robustness,family=binomial(link='logit'),weights=weights)
+## saveRDS(morgan.robustness.1.model,"morgan.robustness.1.model.RDS")
+## remember(extract(morgan.robustness.1.model),"morgan.robustness.1.model")
+
+      
+## ns4.ds[,":="(wiki.age.log = log1p(as.numeric(wiki.age,units="weeks")), age.log = log1p(as.numeric(age,units="weeks")))]
+## f.ns4.2 <- as.formula("reverted ~ age.log + wiki.age.log + age.log|wiki.age.log + wiki.name")
+## ns4.2.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
+## remember(extract(ns4.2.model))
+
+## summary statistics for namespace 4 edits
diff --git a/05_power_simulation.R b/05_power_simulation.R
new file mode 100755 (executable)
index 0000000..da5725b
--- /dev/null
@@ -0,0 +1,53 @@
+#!/usr/bin/env Rscript
+
+# Perform power analysis to assess whether we have enough data to study bots
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+source("lib-00-utils.R")
+library(effects)
+library(texreg)
+if(!exists("r")){
+    source("lib-00-utils.R")
+    source("01_build_datasets.R")
+}
+
+p.outcome <- r$newcomer.summary.stats$p.survives
+p.dv <- r$newcomer.summary.stats$p.bot.reverted
+n <- r$halfak.model@gof[5]
+
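+## simulate one dataset of size n: draw the binary predictor `dv`, build a logit
+## outcome whose intercept matches the observed base rate and whose coefficient
+## on dv is `eff`, refit the model, and return the p-value on dv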
+sample.ds <- function(n,p.outcome,p.dv,eff = -0.01){
+    dv <- rbinom(n=n,size=1,prob=p.dv)
+    iv <- rbinom(n,size=1,prob=p.outcome)
+    m1 <- glm(iv ~ 1, family=binomial(link='logit'))
+    eta <- eff*dv + coef(m1)[1]
+    p <- exp(eta)/(1+exp(eta))
+    tmp <- runif(n)
+    y <- (tmp < p)
+    fit <- glm(y ~ dv,family=binomial(link='logit'))
+    summary(fit)$coefficients[2,4]
+}
+
+eff <- -0.68
+remember(exp(-eff),"power.analysis.effect")
+pwr.test.sig.level <- 0.05
+remember(pwr.test.sig.level)
+n.power.sim <- 1000
+remember(n.power.sim)
+
+out <- replicate(n.power.sim,sample.ds(n,p.outcome,p.dv,eff=eff))
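+## estimated power: share of simulations where the dv coefficient is significant at 0.05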
+remember(mean(out<pwr.test.sig.level),"pwr.test")
+
diff --git a/RCommunityData/.Rbuildignore b/RCommunityData/.Rbuildignore
new file mode 100644 (file)
index 0000000..91114bf
--- /dev/null
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/RCommunityData/.gitignore b/RCommunityData/.gitignore
new file mode 100644 (file)
index 0000000..807ea25
--- /dev/null
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/RCommunityData/DESCRIPTION b/RCommunityData/DESCRIPTION
new file mode 100644 (file)
index 0000000..2a53d7b
--- /dev/null
@@ -0,0 +1,9 @@
+Package: RCommunityData
+Title: library of functions used in communitydata packages
+Version: 0.1
+Authors@R: person("Benjamin Mako", "Hill", email = "mako@atdot.cc", role = c("aut", "cre"))
+Description: library of functions used in communitydata packages
+Depends: R (>= 3.0)
+License: GPLv3+
+Encoding: UTF-8
+LazyData: true
diff --git a/RCommunityData/NAMESPACE b/RCommunityData/NAMESPACE
new file mode 100644 (file)
index 0000000..884a631
--- /dev/null
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/RCommunityData/R/hhi.R b/RCommunityData/R/hhi.R
new file mode 100644 (file)
index 0000000..d17cb3e
--- /dev/null
@@ -0,0 +1,17 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw 
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+## functions to create normalized and non-normalized Herfindahl-Hirschman indexes
+hhi <- function (x) {
+  x <- x / sum(x)
+  sum(x**2)
+}
+
+hhi.norm <- function (x) {
+  n <- length(x)
+  h <- hhi(x)
+  (h - 1/n)/(1-1/n)
+}
+
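+## quick sanity checks (illustrative, not part of the package):
+##   hhi(c(50, 25, 25))  # 0.375 for shares of 50%, 25%, 25%
+##   hhi.norm(rep(1, 4)) # 0: a perfectly even distribution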
diff --git a/RCommunityData/R/load_if_missing.R b/RCommunityData/R/load_if_missing.R
new file mode 100644 (file)
index 0000000..4143886
--- /dev/null
@@ -0,0 +1,24 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw 
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# load a file if a variable is missing
+load.if.missing <- function (var.name, file.name) {
+  if (!exists(var.name)) {
+    load(file.name, parent.frame())
+
+    # check to see if we're dealing with a data.table because, if we
+    # are, we need to do some nasty back and forth
+    if (class(eval(as.name(var.name)))[1] == "data.table") {
+
+      # gnarly function that loads resorts things within the parent
+      # frame to get around the bug in data.table
+      assign(var.name,
+             data.table(as.data.frame(eval(as.name(var.name))),
+                        key=attr(eval(as.name(var.name)), "sorted")),
+             parent.frame())
+    }
+  }
+}
+
diff --git a/RCommunityData/R/namespaces.R b/RCommunityData/R/namespaces.R
new file mode 100644 (file)
index 0000000..0f96399
--- /dev/null
@@ -0,0 +1,59 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw 
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+## functions to deal with namespace information
+#####################################################################
+load.wikia.namespaces <- function () {
+    # load namespace data
+    wikia.namespaces <- read.delim("~/data/wikia_namespaces.tsv",
+                                   stringsAsFactors=TRUE, header=FALSE)
+
+    colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")
+    wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)
+    return(wikia.namespaces)
+}
+
+# enwiki - move to barnstars directory
+# TODO: TEST
+load.enwiki.namespaces <- function(){
+  enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                     14, 15, 100, 101, 108, 109)
+  
+  names(enwiki.ns.num) <- c( "Media", "Special", "", "Talk", "User", "User talk",
+                            "Wikipedia", "Wikipedia talk","File", "File talk",
+                            "MediaWiki", "MediaWiki talk", "Template", "Template talk",
+                            "Help", "Help talk", "Category", "Category talk",
+                            "Portal", "Portal talk", "Book","Book talk")
+  return(enwiki.ns.num)
+}
+
+# function to take a list of article titles and a wiki name and return
+# a list of numbered namespaces
+titles.to.ns.num <- function (page.titles, wiki) {
+    # load wikia namespace data from disk if it does not exist
+    if (!exists("wikia.namespaces")) {
+        wikia.namespaces <- load.wikia.namespaces()
+    }
+    
+    # page.titles <- d$title # DEBUG 
+    ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
+                                c("ns.num", "ns.string")]
+
+    namespaces <- as.character(ns.df$ns.num)
+    names(namespaces) <- ns.df$ns.string
+
+    # drop the zero, we'll deal with it later
+    namespaces <- namespaces[namespaces != "0"]
+    
+    # change underscores to spaces (necessary?)
+    page.titles <- gsub('_', ' ', page.titles)
+    page.ns <- rep("0", length(page.titles))
+
+    for (ns in names(namespaces)) {
+        page.ns[grepl(paste('^', ns, ':', sep=""), page.titles)] <- namespaces[ns]
+    }
+
+    # return the list of namespaces as a factor
+    return(as.factor(page.ns))
+}
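+## illustrative call, assuming "somewiki" appears in the namespace table:
+##   titles.to.ns.num(c("Main Page", "Talk:Main Page"), "somewiki")
+##   => factor c("0", "1")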
diff --git a/RCommunityData/R/wikia_admin.R b/RCommunityData/R/wikia_admin.R
new file mode 100644 (file)
index 0000000..cd19316
--- /dev/null
@@ -0,0 +1,184 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw 
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# privileges of interest:
+# a shared variable that gets used everywhere
+generate.admin.addrm <- function (logevents, current.admins) {
+
+  # convert types of a few variables
+  logevents$ancient <- logevents$ancient == "true"
+  logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
+  logevents$rights.new[is.na(logevents$rights.new)] <- ""
+  logevents$rights.old[is.na(logevents$rights.old)] <- ""
+
+  # TODO do wikia wikis have these =?
+  # in WP, all of these are negated by one day
+  logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]
+  
+  ##########################################
+  ###  Parsing logevents file
+  #########################################
+
+  # separate out moderns & ancients and the necessary columns
+  ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
+  moderns <- logevents[!logevents$ancient,
+                       c("title","rights.new","rights.old","timestamp")]
+  
+  # function that looks at rights.old, rights.new and returns a value of
+  # privilege, add/remove, and timestamp for each user
+  parse.moderns <- function (i, d) {
+    user <- sub('^User:', "", d[i,"title"])
+    change.time <- d[i,"timestamp"]
+    rights.new <- d[i,"rights.new"]
+    rights.old <- d[i,"rights.old"]
+    
+    # create a vector of new and old rights:
+    destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }
+
+    # create a list of privileges that are mentioned
+    privileges <- unique(c(destring(rights.new),
+                           destring(rights.old)))
+
+    # create T/F vectors indicating which privileges were added/removed
+    added <- privileges[privileges %in% destring(rights.new) &
+                        !(privileges %in% destring(rights.old))]
+    removed <- privileges[!(privileges %in% destring(rights.new)) &
+                          privileges %in% destring(rights.old)]
+
+    # assemble the data frame of: role,action,user,timestamp
+    data.frame(user=rep(user, length(c(added,removed))),
+               role=c(added, removed),
+               action=c(rep("added",length(added)),
+                 rep("removed", length(removed))),
+               timestamp=rep(change.time, length(c(added,removed))),
+               era=rep("modern", length(c(added,removed))),
+               stringsAsFactors=FALSE)
+  }
+
+  # if there are log events, and there are non-ancients (not all are ancients), we parse them
+  if (dim(logevents)[1] & !all(logevents$ancient)) {
+    moderns.parsed <- do.call("rbind",
+                              lapply(1:dim(moderns)[1], parse.moderns, moderns))
+  } else {
+    moderns.parsed = NULL
+  }
+  
+  # another function to handle processing the ancients:
+  parse.ancient <- function (i, d) {
+    user <- sub('^.*?:', '', d[i,"title"])
+    comment <- d[i, "comment"]
+    change.time <- d[i, "timestamp"]
+
+    added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))
+
+    # clean any leading or trailing whitespace
+    added <- gsub("^\\s+|\\s+$", "", added)
+
+    data.frame(user=user,
+               role=added,
+               action="added",
+               timestamp=change.time,
+               era="ancient",
+               stringsAsFactors=FALSE)
+  }
+
+  # if there are any ancients, we parse them
+  if (any(logevents$ancient)) {
+    ancients.parsed <- do.call("rbind",
+                               lapply(1:dim(ancients)[1], parse.ancient, ancients))
+  } else {
+    ancients.parsed = NULL
+  }
+
+  combined <- rbind(moderns.parsed, ancients.parsed)
+  
+  ##########################################
+  ###  Parsing current.admins file
+  #########################################
+  # turn each of the columns after the first two into logical
+
+  # function to process pre.ancients
+  parse.current.admins <- function (i, d) {
+    user <- d[i, "username"]
+    roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])
+
+    o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
+    colnames(o) <- c("user", "role")
+    return(o)
+  }
+
+  ## handle the case where there are no admins. This can happen on Wikipedia
+  if(dim(current.admins)[1] != 0){
+      current.admins.parsed <- do.call("rbind",
+                                       lapply(1:dim(current.admins)[1],
+                                              parse.current.admins, current.admins))
+  }
+  else{
+      current.admins.parsed <- NULL
+  }
+
+  # select pre-ancients as people who have a given right *today* but
+  # were never seen as having it added
+  is.pre.ancients <- function (i, d, combined) {
+    user <- d[i, "user"]
+    role <- d[i, "role"]
+
+    # look to see if we've see any events with this user and role added:
+    # if we see none, this is pre-ancient
+    !any(combined$user == user &
+         combined$role == role &
+         combined$action == "added")
+
+  }
+
+  if(!is.null(current.admins.parsed)){
+      # create the list of pre-ancients (user-role combinations we have
+      # not seen in the logevents data)
+      pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
+                                                   is.pre.ancients,
+                                                   current.admins.parsed,
+                                                   combined),]
+  }
+  else{
+      pre.ancients <- NULL
+  }
+
+  # make a list of people who have been removed
+  combined.removed <- combined[combined$action == "removed",]
+  if (!is.null(combined.removed)) {
+    if (dim(combined.removed)[1] > 0) {
+      combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
+                                                  function (i,d) {
+        user <- d[i,"user"]
+        role <- d[i,"role"]
+        timestamp <- d[i,"timestamp"]
+
+        # was the person added before they were removed? OR in the pre-ancients
+        any(combined$user == user &
+            combined$role == role &
+            combined$action == "added" &
+            combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
+      }, combined.removed),c("user", "role")]
+    }
+  }
+
+  
+  pre.ancients <- rbind(pre.ancients, combined.removed)
+  
+  # give them a timestamp that predates all observed events
+  # and then add the pre.ancients to the combined table
+  if(!is.null(pre.ancients)){
+      pre.ancients$action <- "added"
+      pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
+      pre.ancients$era <- "pre.ancient"
+
+      combined <- rbind(combined, pre.ancients)
+  }
+
+  # remove redundant actions
+  combined <- combined[!duplicated(combined),]
+  return(combined)
+}
+
diff --git a/RCommunityData/R/wikiq.R b/RCommunityData/R/wikiq.R
new file mode 100644 (file)
index 0000000..76c9dcf
--- /dev/null
+++ b/RCommunityData/R/wikiq.R
@@ -0,0 +1,86 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw 
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# loads simple utility functions for use in the subsequent files
+
+# store this for re-use across various scripts
+wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
+                  "editor", "editor_id", "minor", "text_size",
+                  "text_entropy", "text_md5", "reversion",
+                  "additions_size", "deletions_size", "edits",
+                  "articles", "users")
+
+# helper function to load the TSV files our perl scripts are generating
+load.extracted.df <- function (filename) {
+  read.delim(filename, header=T, quote="", na.strings="", stringsAsFactors=TRUE)
+}
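+# e.g. (illustrative; the filename is hypothetical):
+#   d <- load.extracted.df("somewiki.tsv")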
+
+# helper function to grab the classes of all columns of a dataframe
+# kept because it is still used, but this is essentially sapply(d, class)
+get.col.classes <- function (d) {
+  sapply(colnames(d), function (col) { class(d[,col]) })
+}
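+# e.g. get.col.classes(data.frame(a = 1, b = "x", stringsAsFactors = FALSE))
+# returns c(a = "numeric", b = "character")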
+
+# convert mediawiki timestamps into POSIXct
+timestamp.to.POSIXct <- function (ts.string)  {
+  ts.string <- gsub("T", " ", ts.string)
+  ts.string <- gsub("Z", "", ts.string)
+  return(as.POSIXct(ts.string, format="%Y-%m-%d %H:%M:%S", tz="UTC"))
+}
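+# e.g. timestamp.to.POSIXct("2004-07-16T20:30:05Z") returns
+# as.POSIXct("2004-07-16 20:30:05", tz="UTC")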
+
+
+read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
+  d <- read.delim(con, stringsAsFactors=FALSE, header=header,
+                  encoding="UTF-8", quote="")
+
+  # rename date.time to timestamp and remove _
+  colnames(d)[colnames(d) == "date.time"] <- "timestamp"
+  colnames(d) <- sub("_", ".", colnames(d))
+  
+  d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
+                                d$timestamp), tz="UTC")
+
+  # convert reversion to a logical
+  d$reversion <- !is.na(d$reversion)
+
+  if (detect.reverts) {
+      # reorder so consecutive rows share a title and are in time order
+      d <- d[order(d$title, d$timestamp),]
+      
+      # generate a list of reverted editors and a list of previous and next md5
+      d$reverted <- c(d$reversion[2:length(d$reversion)],NA)
+      d$md5.next <- c(d$text.md5[2:length(d$reversion)],NA)
+      d$md5.prev <- c(NA,d$text.md5[1:(length(d$reversion)-1)])
+      d$reverted <- d$reverted & (d$md5.next == d$md5.prev)
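+      # i.e. an edit counts as reverted when the following edit is a
+      # reversion that restores the preceding content (md5.next == md5.prev)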
+
+      # drop the extra columns and the last edit
+      d <- d[!is.na(d$reverted),]
+      d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]
+  
+      # create a reverted.by variable by shifting the editors up one row
+      # and then setting non-reverted rows to NA
+      d$reverted.by <- c(d$editor[2:length(d$reversion)], NA)
+      d$reverted.by[!d$reverted] <- NA
+  }  
+  # flag anonymous (IP) editors and use the editor id (the IP address) as the username
+  d$ipaddress <- d$editor == ""
+  d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]
+  
+  return(d)
+}
+
+# TODO refactor this so that we clean the data BEFORE we read it into R.
+# ATM, this only keeps rows with exactly 14 fields.
+
+# see the vereins wiki for "Philcomputing" and 29 lines that seem to
+# have a newline in the editor name
+read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
+  con <- pipe(paste("bzcat", filename, "|awk -F'\t' '{if (NF == 14) print;}'"))
+  d <- read.wikiq(con, header=header, detect.reverts=detect.reverts)
+  rm(con)
+  return(d)
+}
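+# e.g. (illustrative; the filename is hypothetical):
+#   d <- read.bz.wikiq("somewiki.tsv.bz2", detect.reverts=TRUE)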
+
diff --git a/RCommunityData/RCommunityData.Rproj b/RCommunityData/RCommunityData.Rproj
new file mode 100644 (file)
index 0000000..d848a9f
--- /dev/null
+++ b/RCommunityData/RCommunityData.Rproj
@@ -0,0 +1,16 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+Encoding: UTF-8
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..736a973
--- /dev/null
+++ b/README.md
@@ -0,0 +1,147 @@
+Copyright (C)  2018  Nathan TeBlunthuis.
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+A copy of the license is included in the file entitled "fdl-1.3.md".
+
+# Replication data for "Revisiting 'The Rise and Decline' in a Population of Peer Production Projects" #
+
+## Overview ##
+
+This archive contains code and data for reproducing the analysis for
+"Replication Data for Revisiting 'The Rise and Decline' in a
+Population of Peer Production Projects". Depending on what you hope to
+do with the data you probably do not want to download all of the
+files. Depending on your computational resources you may not be able to
+run all stages of the analysis.
+
+The code for all stages, including building the datasets, running the
+analysis, and typesetting the manuscript, is in code.tar.
+
+If you only want to run the final analysis or to play with datasets
+used in the analysis of the paper, you want intermediate_data.7z or
+the uncompressed tab and csv files.
+
+The data files are created in a four-stage process. The first stage
+uses the program "wikiq" to create tsv files that have edit data for
+each wiki from the mediawiki xml dumps. The second stage generates the
+all.edits.RDS file, which contains edit metadata from those tsv
+files. This file is expensive to generate and at 1.5GB is pretty big.
+The third stage builds smaller intermediate files that contain the
+analytical variables. The fourth stage uses the intermediate files to
+generate smaller RDS files that contain the results. Finally, knitr
+and latex typeset the manuscript.
+
+A stage will only run if its outputs do not already exist. So if the
+intermediate files exist, they will not be regenerated and only the
+final analysis will run. The exception is that stage 4, fitting models
+and generating plots, always runs.
+
+If you only want to replicate from the second stage onward, you want
+wikiq_tsvs.7z. If you want to replicate everything, you want
+wikia_mediawiki_xml_dumps.7z.001 and wikia_mediawiki_xml_dumps.7z.002.
+
+These instructions work backwards: from building the manuscript using
+knitr, through loading the datasets and running the analysis, to
+building the intermediate datasets.
+
+## Building the manuscript using knitr ##
+This requires working latex, latexmk, and knitr
+installations. Depending on your operating system you might install
+these packages in different ways. On Debian Linux you can run `apt
+install r-cran-knitr latexmk texlive-latex-extra`. Alternatively, you
+can upload the necessary files to a project on Sharelatex.com or
+Overleaf.com.
+
+1. Download `code.tar`. This has everything you need to typeset the manuscript. 
+2. Unpack the tar archive. On a unix system this can be done by running `tar xf code.tar`.
+3. Navigate to code/paper_source.
+4. Install R dependencies. In R, run `install.packages(c("data.table","scales","ggplot2","lubridate","texreg"))`
+5. On a unix system you should be able to run `make` to build the
+   manuscript `generalizable_wiki.pdf`. Otherwise you should try
+   uploading all of the files (including the tables, figures, and knitr
+   folders) to a new project on ShareLatex.com.
+
+## Loading intermediate datasets ##
+The intermediate datasets are found in the `intermediate_data.7z`
+archive. They can be extracted on a unix system using the command `7z
+x intermediate_data.7z`. The files are 95MB uncompressed. These are
+RDS (R data set) files and can be loaded in R using the `readRDS` function. For
+example `newcomer.ds <- readRDS("newcomers.RDS")`.  If you wish to
+work with these datasets using a tool other than R, you might prefer
+to work with the .tab files.
+
+## Running the analysis ##
+
+Fitting the models may not work on machines with less than 32GB of
+RAM. If you have trouble, you may find the functions in
+lib-01-sample-datasets.R useful to create stratified samples of data
+for fitting models. See line 89 of 02_model_newcomer_survival.R for an
+example.
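+
+As a rough illustration only (this is not one of the helpers from
+lib-01-sample-datasets.R, and the per-wiki cap of 1000 is
+hypothetical), a stratified sample can be drawn with data.table:
+
+```r
+library(data.table)
+## draw at most 1000 newcomers per wiki; smaller wikis are kept whole
+newcomers.sample <- newcomers[, .SD[sample(.N, min(.N, 1000))],
+                              by = wiki.name]
+```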
+
+1. Download `code.tar` and `intermediate_data.7z` to your working
+   folder and extract both archives. On a unix system this can be done
+   with the command `tar xf code.tar && 7z x intermediate_data.7z`.
+2. Install R
+   dependencies. `install.packages(c("data.table","ggplot2","urltools","texreg","optimx","lme4","bootstrap","scales","effects","lubridate","devtools","roxygen2"))`.
+3. On a unix system you can simply run `regen.all.sh` to fit the
+   models, build the plots and create the RDS files.
+
+## Generating datasets ##
+
+### Building the intermediate files ###
+The intermediate files are generated from all.edits.RDS. This process requires about 20GB of memory.
+
+1. Download `all.edits.RDS`, `userroles_data.7z`,`selected.wikis.csv`,
+   and `code.tar`. Unpack `code.tar` and `userroles_data.7z`. On a
+   unix system this can be done using `tar xf code.tar && 7z x
+   userroles_data.7z`.
+2. Install R dependencies. In R, run
+   `install.packages(c("data.table","ggplot2","urltools","texreg","optimx","lme4","bootstrap","scales","effects","lubridate","devtools","roxygen2"))`.
+3. Run `01_build_datasets.R`.
+
+### Building all.edits.RDS ###
+
+The intermediate RDS files used in the analysis are created from
+`all.edits.RDS`. To replicate building all.edits.RDS, you only need to
+run 01_build_datasets.R when the intermediate RDS files and
+`all.edits.RDS` files do not exist in the working
+directory. `all.edits.RDS` is generated from the tsv files generated
+by wikiq. This may take several hours. By default building the dataset
+will use all available CPU cores. If you want to change this, modify
+line 26 of `lib-01-build_newcomer_table.R`.
+
+1. Download selected.wikis.csv, userroles_data.7z, wikiq_tsvs.7z, and
+   code.tar. Unpack the files. On a unix system this can be done by
+   running `7z x userroles_data.7z && 7z x wikiq_tsvs.7z && tar xf
+   code.tar`.
+2. Run `01_build_datasets.R` to generate all.edits.RDS and the intermediate files. 
+
+
+### Running Wikiq to generate tsv files ### 
+If you want to regenerate the datasets all the way from the xml dumps
+and data from the Wikia api you will have to run the python script
+`wikiq`. This is a fairly computationally intensive process. It may
+take over a day unless you can run the computations in parallel.
+
+1. Download `code.tar`, `wikia_mediawiki_xml_dumps.7z.001`,
+   `wikia_mediawiki_xml_dumps.7z.002`, and
+   `userroles_data.7z`. Extract the archives. On a Unix system this
+   can be done by running `tar xf code.tar && 7z x
+   wikia_mediawiki_xml_dumps.7z.001 && 7z x userroles_data.7z`.
+2. Install python3 and python3-pip. Then install `argparse` by running `pip3 install argparse`.
+3. Edit `runwikiq.sh` to set N_THREADS. 
+4. Run `runwikiq.sh` to generate the tsv files.
+
+### Obtaining Bot and Admin data from the Wikia API ###
+For the purposes of supporting an audit of our research project, this
+repository includes the code that we used to obtain Bot and Admin data
+from the Wikia API. Unfortunately, the API has changed since we ran
+the script, and this code no longer works.
+
+Our research group maintains a tool for scraping the Wikia API
+available at https://code.communitydata.cc/wikia_userroles_scraper. This can
+be used to download user roles for the wikis in this dataset. Follow
+the instructions found in that package.
+
diff --git a/lib-00-utils.R b/lib-00-utils.R
new file mode 100644 (file)
index 0000000..fadc502
--- /dev/null
+++ b/lib-00-utils.R
@@ -0,0 +1,172 @@
+# Library containing helper functions
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(parallel)
+library(urltools)
+library(data.table)
+library(texreg)
+
+## load wikiq data for all wikis in the wiki list.
+## this wikiq data doesn't include persistent word revisions
+## and doesn't collapse user edits; we identify user sessions ourselves
+load.wikiq.file <- function(path){
+    d <- fread(paste0(path),
+               colClasses=list(character=c("reverteds", "date_time", "editor", "title")),
+               na.string="", stringsAsFactors=TRUE, quote="",drop=c("sha1","minor"))
+    gc()
+    setnames(d, gsub('_', '.', colnames(d)))
+    setkey(d, "revid")
+    d$date.time <- as.POSIXct(as.character(d$date.time),
+                              format="%Y-%m-%d %H:%M:%S",
+                              tz="UTC")
+
+    d[, ':='(editor = as.factor(url_decode(as.character(editor))), title = as.factor(url_decode(as.character(title))))]
+    
+    d[d$editor == "127.0.0.1","anon"] <- FALSE 
+
+    # drop edits made before MediaWiki was written
+    d <- d[d$date.time > as.POSIXct("2002-01-22",timezone="UTC"),]
+
+    ## drop wikia edits made after 2010-04-10, when data was collected
+    if(wiki.list$wiki.type == "wikia"){
+        d <- d[d$date.time < as.POSIXct("2010-04-10",timezone="UTC"),]
+    }
+
+    # created "reverted" which captures whether an edit has been identity
+    # reverted within the revert RADIUS (currently 15 edits).
+    if (!any(d$revert)) {
+        d$reverted <- FALSE
+        ## we need to reorder the columns in this case
+        ## the merge in the other case also reorders columns
+        setcolorder(d,c("revid",names(d)[!grepl("revid",names(d))]))
+
+    } else {
+        reverteds <- d$reverteds[d$revert]
+
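+        ## "reverteds" holds the comma-separated revids undone by a revert,
+        ## e.g. "1234,1250" (example values are illustrative)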
+        if (!any(grepl(",", d$reverteds))) {
+            reverteds <- unique(as.integer(as.character(d$reverteds)))
+        } else {
+            reverteds <- unique(as.integer(unlist(strsplit(as.character(reverteds), ","))))
+        }
+
+        reverteds <- data.table(revid=reverteds, reverted=TRUE)
+        d <- merge(d, reverteds, all.x=TRUE)
+        d$reverted[is.na(d$reverted)] <- FALSE
+    }
+
+    # "new.id" indicates whether this is a first-time editor
+    setkey(d, "date.time")
+    d$new.account <- !duplicated(d$editor)
+    d$new.account[is.na(d$editor)] <- FALSE
+    d$total.edits <- length(d$revid)
+    d$total.sessions <- seq(1, nrow(d))
+    d$total.editors <- cumsum(d$new.account)
+    d$total.pages <- cumsum(!duplicated(d$articleid))
+    
+    ## add the wiki name to the dt
+
+    ## remove edits not in the namespaces we care about
+    d <- d[namespace %in% c(0,1,3,4),]
+    return(d)
+}
+
+load.wikiq.files <- function(i,wiki.list, path="wikiq_wikia_2010_all_nopersistence/"){
+    wiki.filename = wiki.list[i,filename]
+    wiki <- wiki.list[i,wiki]
+    print(wiki)
+    d <- load.wikiq.file(paste0(path,wiki.filename))
+
+    d$wiki.name <- rep(wiki,nrow(d))
+    d$wiki.type <- rep(wiki.list[i,wiki.type],nrow(d))
+    d[,time.first.edit := min(date.time),by=.(editor.id, wiki.name)]
+
+    return(d)
+}
+
+remember <- function (v, k, silent=FALSE) {
+    if (!exists("r")){
+        rfilename = "remember.RDS"
+        if(file.exists(rfilename)){
+            
+            r <<- readRDS(rfilename)
+        }
+        else
+            r <<- list()
+    }
+        
+    if (missing(k)) {
+        k <- deparse(substitute(v))
+    }
+    
+    ## save to the global r variable/list
+    r[[k]] <<- v
+
+    if (!silent) {
+        print(r[[k]])
+        flush.console()
+    }
+    
+    ## persist the cache so later sessions can reload it
+    saveRDS(r, "remember.RDS")
+
+    invisible(r[[k]])
+}
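+
+## example usage (illustrative):
+##   remember(2 + 2, "four")  # prints 4 and caches it in remember.RDS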
+
+## make sure that appendix and nosave are always defined
+if (!exists("appendix")) { appendix <- FALSE }
+if (!exists("nosave")) { nosave <- FALSE }
+if(!exists("plot.distribtuions")){plot.distributions <- FALSE}
+basedir <- "."
+setwd(basedir)
+include.wikipedia <- FALSE
+if (!exists("wiki.list")) {
+    subdir <- "userroles_data/"
+    if (!exists("missing.wikis")){
+        deleted.wikis <- fread(paste0(subdir,"allusers_deleted_merge.txt"),header=FALSE,col.names=c("wiki"))
+        deleted.wikis <- unique(deleted.wikis$wiki)
+
+        notauthorized.wikis <- fread(paste0(subdir,"allusers_notauthorized_merge.txt"),header=FALSE,col.names=c("wiki"))
+        notauthorized.wikis <- unique(notauthorized.wikis$wiki)
+        missing.wikis = c(deleted.wikis, notauthorized.wikis)
+        remember(deleted.wikis)
+        remember(notauthorized.wikis)
+    }
+
+    wiki.list <- fread("selected.wikis.csv")
+    wiki.list <- wiki.list[! (wiki %in% missing.wikis) ]
+    wiki.list[wiki.type=="wikia",filename:=paste0(wiki,".tsv")]
+
+    if(include.wikipedia){
+        matchidx <- wiki.list[wiki.type=="wikipedia",regexec("https://(.*)\\.wikipedia.org",url)]
+        lang <- sapply(regmatches(wiki.list[wiki.type=="wikipedia",url],matchidx),function (l) l[2])
+        lang <- gsub("-","_",lang)
+        wiki.list[wiki.type=="wikipedia",lang := lang]
+        wiki.list[wiki.type=="wikipedia",filename:=paste0(lang,"_wikipedia.tsv")]
+    }
+    else{
+        wiki.list <- wiki.list[wiki.type != "wikipedia"]
+    }
+
+ #       wiki.list[,lang := NULL]
+
+    rm(missing.wikis)
+}
+
+if (!file.exists("wikis.used")){
+    write(wiki.list$wiki,"wikis.used")
+}
+
+options(mc.cores = 16)
diff --git a/lib-01-build_newcomer_table.R b/lib-01-build_newcomer_table.R
new file mode 100644 (file)
index 0000000..0f8036f
--- /dev/null
+++ b/lib-01-build_newcomer_table.R
@@ -0,0 +1,845 @@
+# Library containing code for processing wikiq tsvs into datasets
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(urltools)
+library(lubridate)
+### is it more efficient to develop inside the loop or outside?
+## with group by outside mclapply
+##  user  system elapsed 
+## 3.743   8.112   6.219 
+
+##    user  system elapsed 
+## 609.715 592.603 638.172 
+
+## with group by inside mclapply
+##  user  system elapsed 
+## 3.670  8.302   5.780 
+
+##    user  system elapsed 
+## 739.826 408.396 596.346 
+## conclusion: do as much outside mclapply as possible
+
+build.newcomer.table.step1 <- function(wiki.list,
+                                       session.window.length = duration(1,units="hours"),
+                                       newcomer.period = duration(2*30,units="days"),
+                                       newcomer.sunset = duration(180,units="days"),
+                                       n.early.period.sessions = 1){
+    d.list <- mclapply(1:nrow(wiki.list),load.wikiq.files,wiki.list=wiki.list,mc.preschedule=F)
+#    d.list <- lapply(1:nrow(wiki.list),wiki.list=wiki.list,load.wikiq.files)
+    all.edits <- rbindlist(d.list)
+    
+    all.edits[,
+              ":="(time.first.edit = min(date.time),
+                   time.last.edit = max(date.time)),
+              by=.(editor.id, wiki.name)]
+
+
+    all.edits[,
+              ":="(editor=gsub("\"","",editor),
+                   title=gsub("\"","",title),
+                   reverteds=gsub("\"","",reverteds))]
+
+    all.edits <- all.edits[editor != "Default"]
+    all.edits[,month:=floor_date(date.time,unit="month")]
+    setkey(all.edits,wiki.name,editor.id,date.time)
+    ## define sessions as runs of consecutive edits less than 1 hour apart
+    all.edits[,":="(time.since.last.edit = diff(c(first(time.first.edit),date.time),lag=1,differences=1),
+                    time.till.next.edit = diff(c(date.time,last(time.last.edit)),lag=1,differences=1),
+                    editor.tenure = as.duration(max(date.time)-min(date.time))),
+              by=.(editor.id,wiki.name)]
+    
+    all.edits[,":="(new.session = time.since.last.edit > session.window.length),by=.(editor.id,wiki.name)]
+    all.edits[,":="(nth.session = cumsum(new.session)),by=.(editor.id,wiki.name)]
+    all.edits[,":="(in.early.session = nth.session < n.early.period.sessions)]
+    
+    all.edits[,
+              ":="(is.reverted = any(reverted),
+                   is.deleted = any(deleted),
+                   p.reverted = mean(reverted & namespace ==0),
+                   n.first.session=nrow(.SD[in.early.session==TRUE])),
+              by=.(editor.id,wiki.name)]
+    all.edits[,":="(age = as.duration(date.time - time.first.edit))]
+
+    all.edits[,":="(last.wiki.edit = max(date.time)),by=.(wiki.name)]
+    all.edits[,":="(is.newcomer = (age < newcomer.period) & (as.duration(last.wiki.edit - time.first.edit) > as.duration(newcomer.sunset)) & !anon)]
+
+    ## did rejecting editors leave a comment on the talk page?
+    return(all.edits)
+}
+
+add.userroles <- function(all.edits,bots,admins){
+
+    bots[,":="(wiki.name = wiki,
+               editor = user
+               ),
+         by=.(wiki,user)
+         ]
+
+    admins[,":="(wiki.name = wiki,
+                 editor = user),
+           by=.(wiki,user)]
+    
+    all.edits[bots,
+              ":="(
+                  is.bot = i.is.bot
+              ),
+                  on=.(wiki.name,
+                       editor,
+                       date.time >= role.period.begin,
+                       date.time <= role.period.end)
+              ]
+
+    all.edits[admins,
+              ":="(
+                  is.admin = i.is.admin
+              ),
+                  on=.(wiki.name,
+                       editor,
+                       date.time >= role.period.begin,
+                       date.time <= role.period.end)
+              ]
+              
+    all.edits[,":="(is.bot = ifelse(is.na(is.bot),FALSE,is.bot),
+                    is.admin = ifelse(is.na(is.admin),FALSE,is.admin))]
+
+    all.edits[,":="(is.newcomer = (is.newcomer & !is.bot))]
+    return(all.edits)
+}
+
+identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){
+
+    all.edits[,user.talk:=as.factor(paste0("User talk:",as.character(all.edits$editor)))]
+
+    ## build the talk page title for each article so talk edits can be joined
+    all.edits[namespace==0,talk:=as.factor(paste0("Talk:",as.character(all.edits[namespace==0]$title)))]
+
+    print("    identifying reverts")
+    all.edits[!is.na(reverteds),reverted.edits := lapply(strsplit(reverteds,","),strtoi)]
+
+    all.edits[!is.na(reverteds),N.reverteds := lapply(reverted.edits,length)]
+
+    ns.edits = all.edits[namespace==0 | namespace==4]
+
+    reverted.lookup <- ns.edits[!is.na(reverteds),
+                                 .(revid = unlist(reverted.edits),
+                                   wiki.name = rep(wiki.name,N.reverteds),
+                                   reverted.by = rep(editor,N.reverteds),
+                                   reverted.by.bot = rep(is.bot, N.reverteds),
+                                   reverted.by.admin = rep(is.admin, N.reverteds),
+                                   revert.date.time = rep(date.time,N.reverteds),
+                                   revert.id = rep(revid,N.reverteds))]
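+    ## e.g. a revert whose "reverteds" field is "10,12" contributes two
+    ## rows to this lookup: one for revid 10 and one for revid 12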
+
+    reverted.edits <- ns.edits[reverted==TRUE]
+
+    reverted.edits[reverted.lookup,
+                   ":="(reverted.by = i.reverted.by,
+                        reverted.by.bot = i.reverted.by.bot,
+                        reverted.by.admin = i.reverted.by.admin,
+                        revert.date.time = i.revert.date.time,
+                        revert.id = revert.id),
+                   on=.(wiki.name,revid)]
+
+    reverted.edits[,message.window.end:= revert.date.time + discussion.window]
+
+    ## merge back revert info to all.edits
+    all.edits[reverted.edits,":="(
+                                 reverted.by = i.reverted.by,
+                                 reverted.by.bot = i.reverted.by.bot,
+                                 reverted.by.admin = i.reverted.by.admin,
+                                 revert.date.time = i.revert.date.time,
+                                 revert.id = revert.id,
+                                 message.window.end = message.window.end),
+              on = .(wiki.name, revid)]
+
+    print("    done")
+    print("    identifying editor talk page edits")
+    ns0.edits = all.edits[namespace==0]
+
+    ## we want talkers who talk before the end of the window
+    talk.page.edits = all.edits[namespace==1]
+    talk.page.edits[,talk:=title]
+
+
+    ## we only need to keep the key identifier for each revert
+    ## use editor + title instead of revid since editors may have more than
+    ## one edit reverted by a given revert.id. 
+    ## key = wiki.name,editor,title,revert.id,
+
+    setkeyv(reverted.edits,c("wiki.name","editor","title","revert.id"))
+    ## condition where editor discusses after being reverted
+    editor.talks <- reverted.edits[talk.page.edits,
+                                   .(
+                                       wiki.name,
+                                       editor = x.editor,
+                                       revert.id = x.revert.id,
+                                       talk.id = i.revid,
+                                       talk.date.time=i.date.time
+                                   )
+                                  ,on=.(editor,
+                                        wiki.name,
+                                        talk,
+                                        revert.date.time<date.time,
+                                        message.window.end>=date.time)
+                                  ,nomatch=0L]
+
+    editor.talks <- editor.talks[,
+                                 .(
+                                     editor.talks = TRUE,
+                                     time.editor.talks = min(talk.date.time),
+                                     editor.talks.revid = min(talk.id)
+                                 ),
+                                 by = .(wiki.name,editor,revert.id)
+                                 ]
+
+    ## merge back reverted edits to all.edits
+    all.edits[editor.talks,
+              ":="(editor.talks = editor.talks,
+                   time.editor.talks = time.editor.talks,
+                   editor.talks.revid=editor.talks.revid),
+              on=.(wiki.name,editor,revert.id)]
+
+    ## tidy up
+    rm(editor.talks, reverted.lookup)
+
+    print("    done")
+    print("    identifying reverter talk page edits")
+    all.edits[,":="(response.window.end = time.editor.talks + discussion.window)]
+    all.edits[(reverted==TRUE & is.na(editor.talks)), editor.talks := FALSE]
+    ns0.edits = all.edits[namespace==0]
+    reverted.edits <- ns0.edits[reverted==TRUE]
+    talk.page.edits <- all.edits[namespace==1]
+    talk.page.edits[,":="(talk = title,reverted.by=editor)]
+
+                                        # the key is still wiki.name, editor, revert.id
+    reverter.talks <- reverted.edits[talk.page.edits,
+                                     .(
+                                         wiki.name = wiki.name,
+                                         editor = x.editor,
+                                         revert.id = x.revert.id,
+                                         revert.date.time = x.revert.date.time,
+                                         time.reverter.talks = i.date.time,
+                                         reverter.talk.id = i.revid
+                                     ),
+                                     on=.(reverted.by,
+                                          wiki.name,
+                                          talk,
+                                          revert.date.time<date.time,
+                                          response.window.end>=date.time),
+                                     nomatch=0L]
+
+    reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
+                                     .(   
+                                         reverter.talks = TRUE,
+                                         time.reverter.talks = min(time.reverter.talks),
+                                         reverter.talk.id = min(reverter.talk.id)
+                                     ),
+                                     by=.(wiki.name,editor,revert.id)
+                                     ]
+
+
+    ## merge back reverted.edits to all.edits
+    all.edits[reverter.talks,
+              ":="(reverter.talks = reverter.talks,
+                   time.reverter.talks = time.reverter.talks,
+                   reverter.talk.id = reverter.talk.id),
+              on=.(wiki.name,editor,revert.id)]
+
+    ## tidy up
+    rm(reverter.talks,talk.page.edits)
+
+    all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]
+
+                                        # if the editor didn't talk first, the time window is different
+    all.edits[reverter.talks == TRUE,
+              editor.talks.first := (time.editor.talks < time.reverter.talks)]
+
+    all.edits[(reverter.talks == TRUE) & (editor.talks.first==FALSE),
+              reverter.talks := time.reverter.talks < (date.time + discussion.window)]
+
+    print("    done")
+    print("    identifying User talk page edits")
+    
+    ## now do the same thing but for user talk pages
+    ## did the reverter post on the editor's user talk page?
+    ## key is wiki.name, title, reverted.by, revert.id
+    ns0.edits = all.edits[namespace==0]
+    user.talk.edits = all.edits[namespace==3]
+    user.talk.edits[,":="(reverted.by=editor,user.talk=title)]
+    reverted.edits = ns0.edits[reverted==TRUE]
+    reverter.messages = reverted.edits[user.talk.edits,
+                                       .(wiki.name = x.wiki.name,
+                                         title = x.title,
+                                         revert.id = x.revert.id,
+                                         editor = x.editor,
+                                         reverted.by = i.reverted.by,
+                                         time.reverter.messages=i.date.time,
+                                         reverter.messages.id=i.revid),
+                                       on=.(wiki.name,
+                                            reverted.by,
+                                            user.talk,
+                                            revert.date.time <= date.time,
+                                            message.window.end >= date.time
+                                            ),
+                                       nomatch=0L]
+
+    reverter.messages = reverter.messages[,.(reverter.messages = TRUE,
+                                             time.reverter.messages = min(time.reverter.messages),
+                                             reverter.message.id = min(reverter.messages.id)),
+                                          by=.(wiki.name, editor, reverted.by, revert.id)]
+
+    reverted.edits[reverter.messages,":="(reverter.messages = reverter.messages,
+                                          time.reverter.messages = time.reverter.messages,
+                                          reverter.message.id = reverter.message.id),
+                   on=.(wiki.name, editor, revert.id)]
+
+    reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]
+    
+    all.edits[reverted.edits,":="(reverter.messages = reverter.messages,
+                                  time.reverter.messages = time.reverter.messages,
+                                  reverter.message.id = reverter.message.id),
+              on=.(wiki.name, editor, revert.id)]
+
+    ## set some wiki-level variables
+    print("    creating wiki windows")
+    setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
+    all.edits[,":="(chars.change = diff(c(0L,text.chars),lag=1,differences=1),
+                    creates.article = (date.time == min(date.time))
+                    ),by=.(wiki.name,articleid)]
+
+    setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
+
+    # Some wikis were created by Wikia itself, which can invalidate
+    # wiki-age measures that do not first remove this initial editor
+
+    all.edits[,":="(wiki.birth.date = min(date.time)),by=.(wiki.name)]
+
+    all.edits[,":="(total.wiki.length = cumsum(chars.change),
+                    n.articles = cumsum(creates.article),
+                    wiki.age = as.duration(date.time - wiki.birth.date),
+                    year = year(date.time)
+                    ),by=.(wiki.name)]
+
+    all.edits[,":="(wiki.age.months = floor(as.double(wiki.age,units='days')/30),
+                    wiki.age.years = floor(as.double(wiki.age,units='years')))]
+    
+    ## generate weekly breaks anchored at each wiki's first edit
+    date.range <- all.edits[,.(first.edit = min(date.time),last.edit = max(date.time)), by = .(wiki.name)]
+
+    window.breaks <- date.range[,.(breaks = seq(trunc(first.edit,"days"),
+                                                trunc(last.edit,"days"),
+                                                by=week.length),
+                                   break.next = seq(trunc(first.edit+week.length,"days"),
+                                                    trunc(last.edit+week.length,"days"),
+                                                    by=week.length)),
+                                by=.(wiki.name)]
+
+    window.breaks[,
+                  ":="(i.break = 1:length(breaks))
+                 ,by=(wiki.name)]
+    
+    all.edits[window.breaks,
+              ":="(week = i.break
+                   ),
+              on=.(wiki.name, date.time <=break.next,date.time >=breaks)]
+
+    print("   done")
+    ## tidy up 
+    all.edits[,":="(reverted.edits = NULL,
+                    N.reverteds = NULL,
+                    user = NULL,
+                    user.talk = NULL,
+                    talk=NULL,
+                    message.window.end=NULL,
+                    response.window.end=NULL)]
+
+    print("    done")
+    rm(reverted.edits,reverter.messages,user.talk.edits,ns0.edits)
+    return(all.edits)
+}
+
+build.newcomers <- function(all.edits,
+                            newcomer.period = duration(60,unit="days"),
+                            newcomer.sunset= duration(30*6,unit="days")
+                            ){
+    setkeyv(all.edits,'date.time')
+
+    all.edits[,":="(time.last.edit.to.wiki = max(date.time)), by=.(wiki.name)]
+
+    all.edits <- all.edits[,time.till.page.edit := c(diff(date.time),as.numeric(NA)),by=.(wiki.name,articleid)]
+    all.edits <- all.edits[,last.edit.to.page :=is.na(time.till.page.edit)]
+
+    all.edits[last.edit.to.page == TRUE,time.till.page.edit := time.last.edit.to.wiki-date.time]
+
+    all.edits <- all.edits[,time.till.page.edit := log1p(as.numeric(time.till.page.edit,units='days'))]
+
+    editor.variables <- all.edits[,
+                                  .(survives = any( (age > newcomer.period) & (age < newcomer.sunset)),anon=first(anon),is.bot=any(is.bot),is.admin=any(is.admin)),
+                                  by = .(wiki.name,editor)
+                                  ]
+    
+    first.session.edits <- all.edits[in.early.session==TRUE]
+    first.session.edits[,":="(end.newcomer.period = time.first.edit + newcomer.period)]
+
+    print("    aggregating newcomer activity within wikis")
+    newcomers <- first.session.edits[namespace == 0,
+                                .(
+                                   is.reverted = any(reverted & reverted.by != editor),
+                                   p.reverted = first(p.reverted),
+                                   is.bot.reverted = any(reverted.by.bot),
+                                   is.admin.reverted = any(reverted.by.admin),
+                                   is.reverted.messaged = any(reverter.messages |
+                                                              reverter.talks,na.rm=TRUE),
+                                   reverter.talks = any(reverter.talks, na.rm=TRUE),
+                                   reverter.messages = any(reverter.messages, na.rm=TRUE),
+                                   editor.talks = any(editor.talks,na.rm=TRUE),
+                                   time.next.page.edit = min(time.till.next.edit, na.rm=TRUE),
+                                   BRD.initiation = any(editor.talks &
+                                                        (editor.talks.first |
+                                                         !reverter.talks), na.rm = TRUE),
+                                   
+                                   BRD.reciprocation = any(editor.talks &
+                                                           editor.talks.first &
+                                                           reverter.talks, na.rm = TRUE),
+                                   reverter.initates.BRD = any(reverter.talks & (!editor.talks.first |
+                                                                                 is.na(editor.talks.first)),na.rm=TRUE),
+                                   time.first.edit = first(time.first.edit),
+                                   time.till.page.edit = min(time.till.page.edit),
+                                   last.edit.to.page = all(last.edit.to.page),
+                                   end.newcomer.period = first(end.newcomer.period),
+                                   week = first(week),
+                                   year = first(year(time.first.edit)),
+                                   newcomer.edits = .N,
+                                   session.edits = first(n.first.session),
+                                   ns0.edits = sum(namespace == 0),
+                                   ns1.edits = sum(namespace == 1),
+                                   ns4.edits = sum(namespace == 4),
+                                   newcomer.chars.change = sum(chars.change),
+                                   newcomer.creates.article = any(creates.article),
+                                   wiki.type = first(wiki.type),
+                                   wiki.age = first(wiki.age)
+                                   ),
+                                by = .(wiki.name, editor)
+                                ]
+
+
+    newcomers[editor.variables,":="(survives = survives,is.bot=is.bot,is.admin=is.admin), on=.(wiki.name,editor)]
+
+    newcomers <- newcomers[!is.bot & !is.admin]    
+    print("    done")
+    print("    identifying newcomer activity on other wikis")
+                                     
+    newcomer.prior.wikis <- first.session.edits[newcomers,
+                                           .(
+                                               editor = editor,
+                                               wiki.name = i.wiki.name,
+                                               other.wiki = x.wiki.name,
+                                               time.first.edit.this = i.time.first.edit,
+                                               time.first.edit.other = x.time.first.edit
+
+                                           ),
+                                           on=.(wiki.type,editor,time.first.edit < time.first.edit),
+                                           nomatch=0L,
+                                           allow.cartesian = TRUE
+                                           ]
+    
+    # using < time.first.edit should exclude edits to this wiki
+    newcomer.prior.wikis <- newcomer.prior.wikis[,.(n.edits.other = .N),
+                                                 by=.(editor,wiki.name,other.wiki)]
+
+    newcomer.prior.wikis <- newcomer.prior.wikis[,
+                                                 .(n.other.wikis = .N,
+                                                   n.edits.other = sum(n.edits.other)),
+                                                 by=.(wiki.name,editor)]
+
+    newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
+                                                 .(
+                                                     wiki.name=wiki.name,
+                                                     editor=editor,
+                                                     n.other.wikis = n.other.wikis,
+                                                     n.edits.other = n.edits.other,
+                                                     has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
+                                                 on=.(wiki.name,editor),
+                                                 nomatch=NA]
+                                                 
+    newcomers <- newcomers[newcomer.prior.wikis,
+                           ":="(n.other.wikis = ifelse(is.na(i.n.other.wikis),0,i.n.other.wikis),
+                                n.edits.other = ifelse(is.na(i.n.edits.other),0,i.n.edits.other),
+                                has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
+                           on=.(wiki.name, editor)
+                           ]
+
+    newcomers[,":="(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),FALSE,has.edited.other.wikis),
+                    n.edits.other = ifelse(is.na(n.edits.other),0,n.edits.other),
+                    n.other.wikis = ifelse(is.na(n.other.wikis),0,n.other.wikis)
+                   )]
+
+    print("    done")
+    print("    identifying all messages")
+                    
+    user.talk.edits <- all.edits[namespace==3]
+
+    user.talk.edits[,user.talk:=title]
+    
+    newcomers[,user.talk:= as.factor(paste0("User talk:",as.character(editor)))]
+
+    newcomer.messages <- user.talk.edits[newcomers,
+                                        .(
+                                          editor = i.editor,
+                                          n.messages = .N,
+                                          end.newcomer.period = i.end.newcomer.period
+                                          ),
+                                        on=.(wiki.name,user.talk,date.time <= end.newcomer.period),
+                                        by=.EACHI,
+                                       nomatch=0L]
+    
+    newcomer.messages <- newcomer.messages[newcomers,
+                                           .(wiki.name,
+                                             editor,
+                                             n.messages = x.n.messages,
+                                             is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
+                                           on=.(wiki.name,editor),
+                                           nomatch = NA]
+
+    newcomers <- newcomers[newcomer.messages,
+                           ":="(n.messages = ifelse(is.na(i.n.messages),0L,i.n.messages),
+                                is.messaged = ifelse(is.na(i.n.messages),FALSE,i.is.messaged)),
+                           on=.(wiki.name,editor)]
+
+    last.edit <- max(all.edits$date.time)
+    last.wikia.edit <- max(all.edits[wiki.type=="wikia",date.time])
+    newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60,units="days")]
+    newcomers <- newcomers[(wiki.type == "wikia") & (time.first.edit < (last.wikia.edit - as.difftime(60,units="days")))]
+    
+    print("    done")
+    return(newcomers)
+}
+
+
+build.namespace4.dataset <- function(all.edits,  week.length = as.difftime(7,units="days")){
+    ns4.reg.edits <- all.edits[(namespace==4) & (anon==FALSE)]
+    
+    return(ns4.reg.edits)    
+}
+    
+
+build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){
+
+    wiki.data <- all.edits[,.(n.editors = length(unique(editor)),
+                              total.wiki.length=last(total.wiki.length)
+                              )
+                           ,by=.(wiki.name,week)]
+    
+    wiki.ns4.data <- all.edits[namespace==4,
+                               .(n.ns4.edits = .N,
+                                 n.ns4.editors = length(unique(editor)),
+                                 d.ns4.length = sum(chars.change),
+                                 ns4.editor.age = mean(age)
+                                 ),
+                               by=.(wiki.name, week)]
+    
+    wiki.ns0.data <- all.edits[namespace==0,
+                               .(revert.rate = mean(reverted,na.rm=TRUE),
+                                 newcomer.revert.rate = sum((reverted & is.newcomer),na.rm=TRUE)/sum(is.newcomer,na.rm=TRUE),
+                                 revert.disc.rate = sum((reverted  & reverter.talks),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
+                                 newcomer.revert.disc.rate = sum((reverted & reverter.talks & is.newcomer),na.rm=TRUE)/ sum(reverted & is.newcomer,na.rm=TRUE),
+                                 revert.message.rate = sum((reverted & reverter.messages),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
+                                 newcomer.revert.message.rate = sum((reverted & reverter.messages & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
+                                 newcomer.edits.rate = mean(is.newcomer,na.rm=TRUE),
+                                 bot.revert.rate = mean(reverted.by.bot,na.rm=TRUE),
+                                 bot.revert.prop = sum(reverted.by.bot,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
+                                 newcomer.bot.revert.rate = mean((reverted.by.bot & is.newcomer),na.rm=TRUE), 
+                                 newcomer.bot.revert.prop = sum((reverted.by.bot & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
+                                 admin.revert.rate = mean(reverted.by.admin,na.rm=TRUE),
+                                 admin.revert.prop = sum(reverted.by.admin,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
+                               year = year(first(date.time)),
+                               month = month(first(date.time))),
+                               by=.(wiki.name,week)]
+
+    ## replace NAs with 0
+    wiki.ns0.data[,
+                  ":="(
+#                      revert.rate = ifelse(is.na(revert.rate),0,revert.rate),
+                      revert.disc.rate = ifelse(is.na(revert.disc.rate),0,revert.disc.rate),
+                      newcomer.revert.disc.rate = ifelse(is.na(newcomer.revert.disc.rate),0,newcomer.revert.disc.rate),
+                      revert.message.rate = ifelse(is.na(revert.message.rate),0,revert.message.rate),
+                      newcomer.revert.message.rate = ifelse(is.na(newcomer.revert.message.rate),0,newcomer.revert.message.rate),
+                      newcomer.edits.rate = ifelse(is.na(newcomer.edits.rate),0,newcomer.edits.rate),
+                      bot.revert.rate = ifelse(is.na(bot.revert.rate),0,bot.revert.rate),
+                      bot.revert.prop = ifelse(is.na(bot.revert.prop),0,bot.revert.prop),
+                      newcomer.bot.revert.rate = ifelse(is.na(newcomer.bot.revert.rate),0,newcomer.bot.revert.rate),
+                      newcomer.bot.revert.prop = ifelse(is.na(newcomer.bot.revert.prop),0,newcomer.bot.revert.prop),
+                      admin.revert.rate = ifelse(is.na(admin.revert.rate),0,admin.revert.rate),
+                      admin.revert.prop = ifelse(is.na(admin.revert.prop),0,admin.revert.prop)),
+                  ]
+
+    ## bring it together
+    wiki.data[wiki.ns0.data,
+              ":="(
+                  revert.rate = i.revert.rate,
+                  revert.disc.rate = i.revert.disc.rate,
+                  newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
+                  revert.message.rate = i.revert.message.rate,
+                  newcomer.revert.message.rate = i.newcomer.revert.message.rate,
+                  newcomer.edits.rate = i.newcomer.edits.rate,
+                  bot.revert.rate = i.bot.revert.rate,
+                  bot.revert.prop = i.bot.revert.prop,
+                  newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
+                  newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
+                  admin.revert.rate = i.admin.revert.rate,
+                  admin.revert.prop = i.admin.revert.prop),
+              on=.(wiki.name,week)]
+
+    wiki.data[wiki.ns4.data,
+              ":="(
+                  n.ns4.edits = i.n.ns4.edits,
+                  n.ns4.editors = i.n.ns4.editors,
+                  d.ns4.length = i.d.ns4.length,
+                  ns4.editor.age = i.ns4.editor.age
+              ),
+              on=.(wiki.name,week)]
+    
+    # create variables for community size in standard deviation units
+    return(wiki.data)
+}
+
+
+load.all.edits <- function(){
+    if(!exists("all.edits")){
+        file.name <- "all.edits.RDS"
+        if(!file.exists(file.name)){
+            print("loading wikiq data")
+
+            all.edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)
+
+            print("done")
+            
+            print("adding user role data")
+            all.edits <- add.userroles(all.edits,bots=bots,admins=admins)
+            print("done")
+
+            print("identifying reverts and messages")
+            all.edits <- identify.revert.messages(all.edits,week.length=as.difftime(7,units="days"))
+            print("done")
+            if(!nosave){
+                print("saving work")
+                saveRDS(all.edits,file.name)
+                print("done")
+            }
+        } else{
+            print("loading wikiq data with reverts and messages")
+            all.edits <- readRDS(file.name)
+            print("done")
+        }
+
+        remember(min(all.edits$date.time),"earliest.data.point")
+        remember(max(all.edits$date.time),"latest.data.point")
+
+        ## make all.edits a global variable
+        all.edits <<- all.edits
+    }
+}
+
+newcomer.period = duration(2*30,unit="days")
+newcomer.sunset = duration(30*6,unit="days")
+week.length=duration(7,unit="days")
+remember(newcomer.period)
+remember(newcomer.sunset)
+remember(week.length)
+
+## try loading newcomers
+
+if(!exists("newcomers")){
+    file.name2 <- "newcomers.RDS"
+    if(file.exists(file.name2)){
+        newcomers <- readRDS(file.name2)            
+    } else{
+        print("building newcomers table")
+        load.all.edits()
+        
+        newcomers <- build.newcomers(all.edits,
+                                     newcomer.sunset = newcomer.sunset,
+                                     newcomer.period=newcomer.period)
+
+        print("done")
+        print("saving work")
+        if(!nosave){
+            saveRDS(newcomers,file.name2)
+        }
+    }
+}    
+
+
+if(!exists("ns4.reg.edits")){
+    file.name <- "ns4.reg.edits.RDS"
+    if(file.exists(file.name)){
+        ns4.reg.edits <- readRDS(file.name)            
+    } else{
+        print("building ns4 edits table")
+        
+        ## create table of namespace 4 edits from all edits
+        load.all.edits()
+        ns4.reg.edits <- build.namespace4.dataset(all.edits)
+        print("done")
+        print("saving work")
+        if(!nosave){
+            saveRDS(ns4.reg.edits,file.name)
+        }
+    }
+}    
+
+if(!exists("wiki.data")){
+    file.name3 <- "wikiweeks.RDS"
+    if(!file.exists(file.name3)){
+        print("building wiki level variable")
+        load.all.edits()
+        wiki.data <- build.wiki.level.variables(all.edits, week.length=week.length)
+        print("done")
+        print("saving work")
+        if(!nosave){
+            saveRDS(wiki.data,file.name3)
+        }
+        print("done")
+    }
+    else{
+        wiki.data <- readRDS(file.name3)
+    }
+}
+
+#wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
+#remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
+#newcomers <- newcomers[!(wiki.name  %in% wikis.to.remove)]
+#all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
+if(!exists("wiki.stats")){
+    file.name <- "wiki.stats.RDS"
+    if(!file.exists(file.name)){
+        load.all.edits()
+
+        editor.tenures <- all.edits[,.(tenure=first(editor.tenure)),by=.(wiki.name,editor)]
+        wiki.stats <- all.edits[,.(total.editors = length(unique(editor)),
+                                   total.edits = .N,
+                                   total.reverts = sum(reverted),
+                                   total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
+                                   total.ns4.edits = nrow(.SD[namespace==4]),
+                                   med.edit.tenure = median(editor.tenure)
+                                   ),by=.(wiki.name)]
+
+        med.editor.tenure <- editor.tenures[,.(med.editor.tenure=median(tenure)),by=.(wiki.name)]
+
+        wiki.stats[med.editor.tenure,med.tenure := med.editor.tenure,on="wiki.name"]
+        newcomer.stats <- newcomers[,.(retention.rate = mean(survives),
+                                       reverted.newcomers = sum(is.reverted)
+                                       ),by=.(wiki.name)]
+        wiki.stats <- wiki.stats[newcomer.stats,':='(retention.rate = retention.rate, reverted.newcomers = reverted.newcomers), on="wiki.name"]
+        remember(wiki.stats,silent=TRUE)
+        saveRDS(wiki.stats,file.name)
+    } else {
+        wiki.stats <- readRDS("wiki.stats.RDS")
+    }
+}
+
+row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
+row2 <- c("med.editor.tenure","retention.rate")
+m.wiki.stats <- melt(wiki.stats,id='wiki.name',measure.vars = c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits"))
+m.wiki.stats[variable %in% row1, ":="(row = 1,col=which(row1 == variable,useNames=F)),by=variable]
+m.wiki.stats[variable %in% row2, ":="(row = 2,col=which(row2 == variable,useNames=F)),by=variable]
+
+m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
+m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]
+
+friendly.var <- function(varname){
+    sapply(as.character(varname),function(f) switch(f,
+                                                    total.editors='Editors',
+                                                    total.reverts='Reverts',
+                                                    total.bot.reverts='Bot reverts',
+                                                    total.ns4.edits='Edits to the project namespace'))
+}
+
+var.id <- function(varname){
+    sapply(as.character(varname),function(f) switch(f,
+                                                    total.editors=1,
+                                                    total.reverts=2,
+                                                    total.bot.reverts=3,
+                                                    total.ns4.edits=4))
+} 
+
+med.line.width <- 1
+m.wiki.stats[,variable := friendly.var(variable)]
+m.wiki.stats <- m.wiki.stats[,variable:=factor(variable,levels=c('Editors',"Reverts","Bot reverts","Edits to the project namespace"))]
+
+spoke.data <- m.wiki.stats[,.(y = median(value)),by=variable]
+remember(m.wiki.stats)
+remember(spoke.data)
+remember(nrow(wiki.stats),"n.wikia.wikis")
+
+## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
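+## (columns prefixed with i. refer to wiki.data, the i table in this data.table join)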
+newcomers <- newcomers[wiki.data,
+          ":="(
+              wiki.name=i.wiki.name,
+              week = i.week,
+              n.editors = i.n.editors,
+              total.wiki.length = i.total.wiki.length,           
+              revert.rate = i.revert.rate,
+              revert.disc.rate = i.revert.disc.rate,            
+              newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
+              revert.message.rate = i.revert.message.rate,         
+              newcomer.revert.message.rate = i.newcomer.revert.message.rate,
+              newcomer.edits.rate = i.newcomer.edits.rate,         
+              bot.revert.rate = i.bot.revert.rate,
+              bot.revert.prop = i.bot.revert.prop,             
+              newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
+              newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,    
+              admin.revert.rate = i.admin.revert.rate,
+              admin.revert.prop = i.admin.revert.prop,           
+              n.ns4.edits = i.n.ns4.edits,
+              n.ns4.editors = i.n.ns4.editors,               
+              d.ns4.length = i.d.ns4.length,
+              ns4.editor.age = i.ns4.editor.age,
+              wiki.age.weeks = as.double(wiki.age,units='days')/7,
+              wiki.age.months = floor(as.double(wiki.age,units='days')/30),
+              wiki.age.half.years = floor(as.double(wiki.age,units='years')*2),
+              wiki.age.years = floor(as.double(wiki.age,units='years')),
+              quarter = factor(floor_date(time.first.edit,unit="3 months"))
+          ),
+          on=.(wiki.name,week)
+          ]
+
+
+survival.data <- newcomers[,.(survival.rate = mean(survives),
+                              n.newcomers = .N),
+                           by = .(wiki.name, week)]
+wiki.data <- wiki.data[survival.data,
+          ":="(
+              survival.rate = survival.rate,
+              n.newcomers =  n.newcomers),
+          on = .(wiki.name,week)]
+
+file.name <- "active.editors.RDS"
+if(!file.exists(file.name)){
+    load.all.edits()
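+    ## count each editor's edits within each month of wiki age, per wiki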
+    active.editors <- all.edits[,
+                                .(N.edits=.N,
+                                  wiki.age.years=first(wiki.age.years)),
+                                by=.(wiki.name,
+                                     editor,
+                                     wiki.age.months)]
+    if(!nosave){
+        saveRDS(active.editors, file.name)
+    }
+
+} else {
+    active.editors <- readRDS(file.name)
+}
diff --git a/lib-01-generate_userroles.R b/lib-01-generate_userroles.R
new file mode 100644 (file)
index 0000000..9351f55
--- /dev/null
@@ -0,0 +1,85 @@
+# Processes data from the Wikia API to identify bots and admins
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+library(devtools)
+library(data.table)
+
+load_all("RCommunityData")
+
+# Load the logevents and current userroles files for one wiki and build its admin/bot role-change table
+load.rights.files <- function (filename) {
+  wiki <- gsub('\\.tsv$', '', filename)
+  print(wiki)
+  logevents <- read.delim(paste("logevents-2017/", filename, sep=""),
+                          stringsAsFactors=FALSE, encoding="UTF-8", quote="")
+
+  current.userroles <- read.delim(paste("userlist-2017/", filename, sep=""),
+                               stringsAsFactors=FALSE, na.string="",
+                               encoding="UTF-8", header=TRUE)
+  
+  d <- generate.admin.addrm(logevents, current.userroles)
+  d$wiki <- wiki
+  return(d)
+}
+
+setwd("userroles_data/")
+wiki.files = paste0(wiki.list$wiki,".tsv")
+userroles <- rbindlist(lapply(wiki.files, load.rights.files))
+userroles$blocked <- grepl('^<span class="listusers_blockeduser">(.*?)$', userroles$role)
+userroles$role <- gsub('^<span class="listusers_blockeduser">(.*?)$','\\1', userroles$role)
+userroles$role <- gsub('^(.*?)</span>$','\\1', userroles$role)
+
+userroles[, is.action.admin := (role %in% c("sysop", "bureaucrat","sysop,bureaucrat","staff","admin","fanonadmin","steward"))]
+userroles[, is.action.bot := (role %in%  c("bot", "fyzbot","bot-global"))]
+
+bots = userroles[is.action.bot==TRUE]
+admins = userroles[is.action.admin==TRUE]
+
+setorder(bots, timestamp)
+setorder(admins, timestamp)
+## we want to keep track of when the roles changed
+## assume nobody was a bot or admin at the beginning of Mediawiki
+
+## userroles[,':='(
+##     prev.isbot = ifelse(is.na(prev.isbot),(isbot & action=="removed"),prev.isbot)
+
+bots[,
+     ":="(
+         role.period.begin = timestamp,
+         role.period.end = shift(timestamp,fill=as.POSIXct("2017-01-01"),type="lead"))
+    ,by = .(wiki,user)
+     ]
+
+bots[,":="(is.bot = (action == "added"))]
+
+admins[,
+       ":="(
+           role.period.begin = timestamp,
+           role.period.end = shift(timestamp,fill=as.POSIXct("2017-01-01"),type="lead"))
+      ,by = .(wiki,user)
+       ]
+
+admins[,":="(is.admin = (action == "added") )]
+
+# save data to an output file for knitr
+setwd("..");
+rm(load.rights.files)
+rm(wiki.files,userroles)
+
+if (!nosave) {
+    saveRDS(bots, file="bots.RDS")
+    saveRDS(admins, file="admins.RDS")
+    saveRDS(r, file="lib-01-generate_userroles.RDS")
+}
diff --git a/lib-01-sample-datasets.R b/lib-01-sample-datasets.R
new file mode 100644 (file)
index 0000000..2aaea1e
--- /dev/null
@@ -0,0 +1,57 @@
+# Functions for creating samples of datasets
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+sample.by.wiki <- function(dt,outcome,N.per.wiki=30){
+    set.seed(0)
+    sample.params <- dt[,.N,by=wiki.name]
+    sample.params[,group.sd := dt[,.(group.sd=sd(.SD[[outcome]])),by=wiki.name]$group.sd]
+
+    sample.params[,p.in.group := N / nrow(dt)]
+
+    sample.params[,min.N := min(N)]
+    sample.params[,n.from.group := pmin(min.N/(1-group.sd), N)]
+
+    sample.params[,p.sampled := n.from.group/N]
+
+    sample.params[,weight := 1/p.sampled]
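+    ## weight is the inverse inclusion probability, so models fit on the
+    ## sample can be reweighted back toward the full dataset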
+    dt[sample.params,":="(prob=p.sampled,weight=weight),on=.(wiki.name)]
+    sample.idx <- sample(nrow(dt),size=sum(sample.params$n.from.group,na.rm=TRUE),prob=dt$prob)
+
+    return(dt[sample.idx])
+}
+
+sample.newcomers <- function()
+{
+    wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
+    remember(length(wikis.to.remove),"n.wikis.insufficient.newcomers")
+    newcomers.presample <- newcomers[!(wiki.name  %in% wikis.to.remove)]
+    newcomers.sample <- sample.by.wiki(newcomers.presample,"survives")
+    return(newcomers.sample)
+}
+
+sample.ns4.edits <- function(){
+    wikis.to.keep <- ns4.reg.edits[,.(.N,N.reverts=sum(reverted)),by=wiki.name][(N>30)&(N.reverts > 30)]
+    ns4.reg.edits.sub <- ns4.reg.edits[wiki.name %in% wikis.to.keep$wiki.name]
+    ns4.reg.edits.sample <- sample.by.wiki(ns4.reg.edits.sub,"reverted")
+    return(ns4.reg.edits.sample)
+}
+
+sample.wiki.data <- function(){
+    ## just choose 100 random wikis
+    wikis.to.keep <- sample(unique(wiki.data$wiki.name),100)
+    wiki.data.sample <- wiki.data[wiki.name %in% wikis.to.keep]
+    return(wiki.data.sample)
+}
diff --git a/mediawiki_dump_tools/.gitignore b/mediawiki_dump_tools/.gitignore
new file mode 100644 (file)
index 0000000..616dc22
--- /dev/null
@@ -0,0 +1,5 @@
+*.xml.gz
+*.7z
+*.xml.bz2
+*.xml.xz
+*.swp
diff --git a/mediawiki_dump_tools/.gitmodules b/mediawiki_dump_tools/.gitmodules
new file mode 100644 (file)
index 0000000..6c9d975
--- /dev/null
@@ -0,0 +1,3 @@
+[submodule "Mediawiki-Utilities"]
+       path = Mediawiki-Utilities
+       url = https://github.com/halfak/Mediawiki-Utilities.git
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/.gitignore b/mediawiki_dump_tools/Mediawiki-Utilities/.gitignore
new file mode 100644 (file)
index 0000000..d9d6192
--- /dev/null
@@ -0,0 +1,46 @@
+# Demo files
+demo_*
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Temporary text editor files
+*~
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+#lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Sphinx documentation
+doc/_build/
+doc/.buildfile
+*.toctree
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/CHANGE_LOG.rst b/mediawiki_dump_tools/Mediawiki-Utilities/CHANGE_LOG.rst
new file mode 100644 (file)
index 0000000..5f4a516
--- /dev/null
@@ -0,0 +1,19 @@
+v0.4.4
+======
+
+Adds API helper for persistence tracking and example script.
+
+v0.4.0
+======
+
+Adds api.collections.users
+
+v0.3.8
+======
+
+Adds support for spaces in XML dump filenames when using the dump mapper.
+
+v0.3.7
+======
+
+Fixes pickling issues in Timestamp
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/LICENSE b/mediawiki_dump_tools/Mediawiki-Utilities/LICENSE
new file mode 100644 (file)
index 0000000..f00d188
--- /dev/null
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Aaron Halfaker
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/MANIFEST.in b/mediawiki_dump_tools/Mediawiki-Utilities/MANIFEST.in
new file mode 100644 (file)
index 0000000..8dce522
--- /dev/null
@@ -0,0 +1 @@
+include LICENSE README.rst
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/README.rst b/mediawiki_dump_tools/Mediawiki-Utilities/README.rst
new file mode 100644 (file)
index 0000000..a481a3e
--- /dev/null
@@ -0,0 +1,25 @@
+===================
+MediaWiki Utilities
+===================
+MediaWiki Utilities is an open source (MIT Licensed) library developed by Aaron Halfaker for extracting and processing data from MediaWiki installations, slave databases and xml dumps.
+
+**Install with pip:** ``pip install mediawiki-utilities``
+
+**Note:** *Use of this library requires Python 3 or later.*
+
+**Documentation:** http://pythonhosted.org/mediawiki-utilities/
+
+About the author
+================
+:name: 
+       Aaron Halfaker
+:email:
+       aaron.halfaker@gmail.com
+:website:
+       http://halfaker.info --
+       http://en.wikipedia.org/wiki/User:EpochFail
+
+Contributors
+============
+None yet.  See http://github.com/halfak/mediawiki-utilities.  Pull requests are encouraged.
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/WORK_LOG.rst b/mediawiki_dump_tools/Mediawiki-Utilities/WORK_LOG.rst
new file mode 100644 (file)
index 0000000..bcbcb94
--- /dev/null
@@ -0,0 +1,63 @@
+2014-06-02
+       After some reading, it looks like py3 will do something reasonable with re-raised errors, so I'm just going to let the error be re-raised and call it good.
+
+2014-05-31
+	I figured out that you just plain can't get a stack trace out of a multiprocessing.Process in such a way that you can re-associate it with its exception on the other side.  I'm now working on putting together a picklable container exception that I can use to manage and format the exceptions that come out of a mapping function.  It's not going great.
+
+2014-04-08
+       I've been extending the API.  I added list=deletedrevs and tested (fixed) the api.Session.login() method.  It all seems to work now.  I also did some minor cleanup on  lib.title.Parser to make the method names more explicit.
+       
+       I'd like to start tracking changes so that I can build changelists to go with new versions.  For now, I'll keep track of substantial changes here.
+       
+       * Released 0.2.1
+       * Added list=deletedrevs to api module
+
+2014-03-27
+       I just fixed up the structure for lib.reverts.database.check() and check_row().  You can give check_row() a database row or check() a rev_id and page_id.  The functions should then either return None or the first reverting revision they encounter.
+       
+       I like this pattern.  Lib gets to reference core, but not vice versa.  I need to talk to the Wikimetrics people about implementing some of the metrics within a new lib.  Yet, one of the cool things about libs is that they don't necessarily need to be packaged with core.  So you could write something that makes use of core and other libs as a standalone package first and incorporate it later.  :D
+
+2014-03-20
+       Just a quick update today.  I realized that database.DB.add_args was setting
+       default values that won't make sense for anyone but me personally.  I cleared that up and added a way to set your own defaults.
+
+2014-03-18
+       Refactoring!  I've got a user.  He immediately found problems.  So I'm fixing them aggressively.  I just renamed the library back to "mw".  I also renamed the dump processing module to "xml_dump".  I hope that these name changes will make more sense.
+       
+       I also moved the revert detection functionality out of the database module and into the lib.reverts module.  I think that this makes more sense.  If it is a core functionality, it should live in code.  If it is a library, it should only have other libraries depend on it.  If I need to write a magical DB abstractor in lib, so be it.
+
+2014-02-08
+       It's time to kill `mw.lib.changes`.  I just don't see that working as a core
+	part of this library.  It might make sense to build up another library
+       to handle changes.  I'll have to get back to that at some other time.
+
+2013-12-23
+       Still hacking on `mw.lib.changes`.  It's the same set of issues described in
+	the last log.  I'm making progress building a params parser.  I think that my strategy is going to be to let the user handle params parsing themselves with a new `types.Protection` type.
+       
+       Oh! And I did get `types.TimestampType` extended to have a `strptime` method.
+       That's all nice and tested.
+       
+       Note that I think it might be a good idea to consolidate all defaults for
+       better documentation.
+       
+       Anyway.  All tests are passing.  It's time to work on something else for a
+       little while.
+
+2013-12-19
+	Still working on `mw.lib.changes`.  I like the structure for the most part.  It looks like I'm going to have to join `revision` and `logging` to `recentchanges` in order to construct an appropriate `change.Change` from a row.  That means I'm going to need a funny new method on `database.RecentChanges`.  That's going to confuse people.  Boo.
+       
+       I also need to figure out a way to configure for the lame timestamp format that appears in blocks and page protections.  I think I'm going to extend `types.TimestampType` to have a `strptime` method.
+
+2013-12-18
+       Tests passing.  HistoricalMap was fine.  Will be code-complete once lib.changes is done.  Still need to figure out how I'm going to configure a title parser and pass it into the change constructor.  Also, I rediscovered how stupid the recentchanges table is.
+       
+       OK.. New lame thing.  So, when you "protect" a page, the log keeps the following type of value in log_params:
+       ``\u200e[edit=autoconfirmed] (expires 03:20, 21 November 2013 (UTC))``
+       
+       That date format... It's not the long or short format for `Timestamp`. I think it is a custom format that changes on a wiki-to-wiki basis.
+       
+       I feel sad.  This made my day worse.  It's important to remind myself of the fact that MediaWiki was not designed to allow me to reverse engineer it.
+       
+2013-12-17
+       Test on revert detector failing since simplifying restructure.  I'm not sure what the issue is, but I suspect that I broke something in util.ordered.HistoricalMap. -halfak
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/debian/changelog b/mediawiki_dump_tools/Mediawiki-Utilities/debian/changelog
new file mode 100644 (file)
index 0000000..53d7d21
--- /dev/null
@@ -0,0 +1,5 @@
+python3-mediawiki-utilities (0.4.16) UNRELEASED; urgency=medium
+
+  * Initial version of the package
+
+ -- yuvipanda <yuvipanda@riseup.net>  Tue, 04 Aug 2015 16:42:51 -0700
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/debian/compat b/mediawiki_dump_tools/Mediawiki-Utilities/debian/compat
new file mode 100644 (file)
index 0000000..ec63514
--- /dev/null
@@ -0,0 +1 @@
+9
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/debian/control b/mediawiki_dump_tools/Mediawiki-Utilities/debian/control
new file mode 100644 (file)
index 0000000..b6c7f55
--- /dev/null
@@ -0,0 +1,18 @@
+Source: python3-mediawiki-utilities
+Maintainer: Aaron Halfaker <aaron.halfaker@gmail.com>
+Section: python
+Priority: optional
+Build-Depends: python3-setuptools, python3-all, debhelper (>= 9), python3-nose, python3-pymysql, python3-requests
+Standards-Version: 3.9.6
+
+Package: python3-mediawiki-utilities
+Architecture: all
+Depends: ${misc:Depends}, ${python3:Depends}
+Description: Utilities for extracting and processing MediaWiki data
+ Provides a Python 3 library for extracting and processing data from
+ MediaWiki installations, slave databases and XML dumps.
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/debian/copyright b/mediawiki_dump_tools/Mediawiki-Utilities/debian/copyright
new file mode 100644 (file)
index 0000000..3944fea
--- /dev/null
@@ -0,0 +1,26 @@
+Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: mediawiki-utilities
+
+Files: *
+Copyright: 2014 Aaron Halfaker <aaron.halfaker@gmail.com>
+License: MIT
+
+License: MIT
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ .
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ .
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/debian/rules b/mediawiki_dump_tools/Mediawiki-Utilities/debian/rules
new file mode 100755 (executable)
index 0000000..641186e
--- /dev/null
@@ -0,0 +1,4 @@
+#!/usr/bin/make -f
+
+%:
+       dh $@ --with python3 --buildsystem=pybuild
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/Makefile b/mediawiki_dump_tools/Mediawiki-Utilities/doc/Makefile
new file mode 100644 (file)
index 0000000..0befa28
--- /dev/null
@@ -0,0 +1,182 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    = -v
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest xml pseudoxml htmlzip
+
+help:
+       @echo "Please use \`make <target>' where <target> is one of"
+       @echo "  html       to make standalone HTML files"
+       @echo "  dirhtml    to make HTML files named index.html in directories"
+       @echo "  singlehtml to make a single large HTML file"
+       @echo "  pickle     to make pickle files"
+       @echo "  json       to make JSON files"
+       @echo "  htmlhelp   to make HTML files and a HTML help project"
+       @echo "  qthelp     to make HTML files and a qthelp project"
+       @echo "  devhelp    to make HTML files and a Devhelp project"
+       @echo "  epub       to make an epub"
+       @echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+       @echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+       @echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+       @echo "  text       to make text files"
+       @echo "  man        to make manual pages"
+       @echo "  texinfo    to make Texinfo files"
+       @echo "  info       to make Texinfo files and run them through makeinfo"
+       @echo "  gettext    to make PO message catalogs"
+       @echo "  changes    to make an overview of all changed/added/deprecated items"
+       @echo "  xml        to make Docutils-native XML files"
+       @echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+       @echo "  linkcheck  to check all external links for integrity"
+       @echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+       rm -rf $(BUILDDIR)/*
+
+html:
+       $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+       $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+       $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+       @echo
+       @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+       $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+       @echo
+       @echo "Build finished; now you can process the pickle files."
+
+json:
+       $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+       @echo
+       @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+       $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+       @echo
+       @echo "Build finished; now you can run HTML Help Workshop with the" \
+             ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+       $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+       @echo
+       @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+             ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+       @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mediawiki-utilities.qhcp"
+       @echo "To view the help file:"
+       @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mediawiki-utilities.qhc"
+
+devhelp:
+       $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+       @echo
+       @echo "Build finished."
+       @echo "To view the help file:"
+       @echo "# mkdir -p $$HOME/.local/share/devhelp/mediawiki-utilities"
+       @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mediawiki-utilities"
+       @echo "# devhelp"
+
+epub:
+       $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+       @echo
+       @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo
+       @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+       @echo "Run \`make' in that directory to run these through (pdf)latex" \
+             "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through pdflatex..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through platex and dvipdfmx..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+       $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+       @echo
+       @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+       $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+       @echo
+       @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo
+       @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+       @echo "Run \`make' in that directory to run these through makeinfo" \
+             "(use \`make info' here to do that automatically)."
+
+info:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo "Running Texinfo files through makeinfo..."
+       make -C $(BUILDDIR)/texinfo info
+       @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+       $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+       @echo
+       @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+       $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+       @echo
+       @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+       $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+       @echo
+       @echo "Link check complete; look for any errors in the above output " \
+             "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+       $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+       @echo "Testing of doctests in the sources finished, look at the " \
+             "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+       $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+       @echo
+       @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+       $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+       @echo
+       @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+htmlzip: html
+       cd _build/html/ && \
+       zip -r ../../html.zip * && \
+       cd ../../
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/_static/PLACEHOLDER b/mediawiki_dump_tools/Mediawiki-Utilities/doc/_static/PLACEHOLDER
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/_templates/PLACEHOLDER b/mediawiki_dump_tools/Mediawiki-Utilities/doc/_templates/PLACEHOLDER
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/conf.py b/mediawiki_dump_tools/Mediawiki-Utilities/doc/conf.py
new file mode 100644 (file)
index 0000000..08dae41
--- /dev/null
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# mediawiki-utilities documentation build configuration file, created by
+# sphinx-quickstart on Thu Apr 10 17:31:47 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../'))
+import mw
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.doctest',
+    'sphinx.ext.todo',
+    'sphinx.ext.coverage',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'mediawiki-utilities'
+copyright = '2014, Aaron Halfaker'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = mw.__version__
+# The full version, including alpha/beta/rc tags.
+release = mw.__version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'mediawiki-utilitiesdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  ('index', 'mediawiki-utilities.tex', 'mediawiki-utilities Documentation',
+   'Aaron Halfaker', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'mediawiki-utilities', 'mediawiki-utilities Documentation',
+     ['Aaron Halfaker'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'mediawiki-utilities', 'mediawiki-utilities Documentation',
+   'Aaron Halfaker', 'mediawiki-utilities', 'Utilities for extracting and processing MediaWiki data.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/api.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/api.rst
new file mode 100644 (file)
index 0000000..6c34ff4
--- /dev/null
@@ -0,0 +1,77 @@
+.. _mw.api:
+
+===================================
+mw.api -- MediaWiki API abstraction
+===================================
+
+This module contains a set of utilities for interacting with the MediaWiki API.
+
+Here's an example of a common usage pattern:
+       
+       >>> from mw import api
+       >>> 
+       >>> session = api.Session("https://en.wikipedia.org/w/api.php")
+       >>> 
+       >>> revisions = session.revisions.query(
+       ...     properties={'ids', 'content'},
+       ...     titles={"User:EpochFail"},
+       ...     direction="newer",
+       ...     limit=3
+       ... )
+       >>> 
+       >>> for rev in revisions:
+       ...     print(
+       ...             "rev_id={0}, length={1} characters".format(
+       ...                     rev['revid'],
+       ...                     len(rev.get('*', ""))
+       ...             )
+       ...     )
+       ... 
+       rev_id=190055192, length=124 characters
+       rev_id=276121340, length=132 characters
+       rev_id=276121389, length=124 characters
+
+Session
+=======
+
+.. autoclass:: mw.api.Session
+   :members:
+   :member-order: bysource
+
+
+Collections
+===========
+
+.. autoclass:: mw.api.DeletedRevisions
+   :members:
+
+.. autoclass:: mw.api.Pages
+   :members:
+
+.. autoclass:: mw.api.RecentChanges
+   :members:
+
+.. autoclass:: mw.api.Revisions
+   :members:
+
+.. autoclass:: mw.api.SiteInfo
+   :members:
+
+.. autoclass:: mw.api.UserContribs
+   :members:
+
+Errors
+======
+
+
+.. autoclass:: mw.api.errors.APIError
+   :members:
+   :inherited-members:
+
+.. autoclass:: mw.api.errors.AuthenticationError
+   :members:
+   :inherited-members:
+
+.. autoclass:: mw.api.errors.MalformedResponse
+   :members:
+   :inherited-members:
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/database.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/database.rst
new file mode 100644 (file)
index 0000000..3939687
--- /dev/null
@@ -0,0 +1,53 @@
+.. _mw.database:
+
+=========================================
+mw.database -- MySQL database abstraction
+=========================================
+
+This module contains a set of utilities for interacting with MediaWiki databases.
+
+Here's an example of a common usage pattern:
+::
+       
+       from mw import database
+       
+       db = database.DB.from_params(
+               host="s1-analytics-slave.eqiad.wmnet", 
+               read_default_file="~/.my.cnf", 
+               user="research", 
+               db="enwiki"
+       )
+       revisions = db.revisions.query(user_id=9133062)
+       
+       for rev_row in revisions:
+               rev_row['rev_id']
+
+
+DB
+======
+
+.. autoclass:: mw.database.DB
+   :members:
+   :member-order: bysource
+   
+
+Collections
+===========
+
+.. autoclass:: mw.database.Archives
+   :members:
+
+.. autoclass:: mw.database.AllRevisions
+   :members:
+
+.. autoclass:: mw.database.Pages
+   :members:
+
+.. autoclass:: mw.database.RecentChanges
+   :members:
+
+.. autoclass:: mw.database.Revisions
+   :members:
+
+.. autoclass:: mw.database.Users
+   :members:
\ No newline at end of file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/xml_dump.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/core/xml_dump.rst
new file mode 100644 (file)
index 0000000..d38a696
--- /dev/null
@@ -0,0 +1,52 @@
+.. _mw.xml_dump:
+
+==================================
+mw.xml_dump -- XML dump processing
+==================================
+
+.. automodule:: mw.xml_dump
+
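+Here's a sketch of a common usage pattern, iterating over the small
+``examples/dump.xml`` file that ships with this repository (the field
+names follow the classes documented below)::
+
+    from mw import xml_dump
+
+    dump = xml_dump.Iterator.from_file(open("examples/dump.xml"))
+
+    for page in dump:
+        for revision in page:
+            print(page.title, revision.id)
+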
+The map() function
+==================
+
+.. autofunction:: mw.xml_dump.map
+
+Iteration
+=========
+
+.. autoclass:: mw.xml_dump.Iterator
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Page
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Redirect
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Revision
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Comment
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Contributor
+   :members:
+   :member-order: bysource
+
+.. autoclass:: mw.xml_dump.Text
+   :members:
+   :member-order: bysource
+
+Errors
+======
+
+.. autoclass:: mw.xml_dump.errors.FileTypeError
+   :members:
+
+.. autoclass:: mw.xml_dump.errors.MalformedXML
+   :members:
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/index.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/index.rst
new file mode 100644 (file)
index 0000000..583a3c0
--- /dev/null
@@ -0,0 +1,100 @@
+.. mediawiki-utilities documentation master file, created by
+   sphinx-quickstart on Thu Apr 10 17:31:47 2014.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+===================
+MediaWiki Utilities
+===================
+
+MediaWiki Utilities is an open source (MIT Licensed) library developed by Aaron Halfaker for extracting and processing data from MediaWiki installations, slave databases and xml dumps.
+
+**Install with pip:** ``pip install mediawiki-utilities``
+
+**Note:** *Use of this library requires Python 3 or later.*
+
+Types
+=====
+:ref:`mw.Timestamp <mw.types>`
+       A simple datatype for handling MediaWiki's various time formats.
+
+Core modules
+============
+
+:ref:`mw.api <mw.api>`
+       A set of utilities for interacting with MediaWiki's web API.
+       
+       * :class:`~mw.api.Session` -- Constructs an API session with a MediaWiki installation.  Contains convenience methods for accessing ``prop=revisions``,  ``list=usercontribs``, ``meta=siteinfo``, ``list=deletedrevs`` and ``list=recentchanges``.
+
+:ref:`mw.database <mw.database>`
+       A set of utilities for interacting with MediaWiki's database.
+       
+       * :class:`~mw.database.DB` -- Constructs a mysql database connector with convenience methods for accessing ``revision``, ``archive``, ``page``, ``user``, and ``recentchanges``.
+
+:ref:`mw.xml_dump <mw.xml_dump>`
+       A set of utilities for processing MediaWiki's XML database dumps quickly and without dealing with streaming XML. 
+       
+       * :func:`~mw.xml_dump.map` -- Applies a function to a set of dump files (:class:`~mw.xml_dump.Iterator`) using :class:`multiprocessing` and aggregates the output.
+       * :class:`~mw.xml_dump.Iterator` -- Constructs an iterator over a standard XML dump.  Dumps contain site_info and pages.  Pages contain metadata and revisions.  Revisions contain metadata and text.  This is probably why you are here.
+
+Libraries
+=========
+
+:ref:`mw.lib.persistence <mw.lib.persistence>`
+       A set of utilities for tracking the persistence of content between revisions.
+       
+       * :class:`~mw.lib.persistence.State` -- Constructs an object that represents the current content persistence state of a page.  Reports useful details about the persistence of content when updated.
+
+:ref:`mw.lib.reverts <mw.lib.reverts>`
+       A set of utilities for performing revert detection
+       
+       * :func:`~mw.lib.reverts.detect` -- Detects reverts in a sequence of revision events.
+       * :class:`~mw.lib.reverts.Detector` -- Constructs an identity revert detector that can be updated manually over the history of a page. 
+
+:ref:`mw.lib.sessions <mw.lib.sessions>`
+       A set of utilities for grouping revisions and other events into sessions
+       
+       * :func:`~mw.lib.sessions.cluster` -- Clusters a sequence of user actions into sessions.
+       * :class:`~mw.lib.sessions.Cache` -- Constructs a cache of recent user actions that can be updated manually in order to detect sessions.
+
+:ref:`mw.lib.title <mw.lib.title>`
+       A set of utilities for normalizing and parsing page titles
+       
+       * :func:`~mw.lib.title.normalize` -- Normalizes a page title.  
+       * :class:`~mw.lib.title.Parser` -- Constructs a parser with a set of namespaces that can be used to parse and normalize page titles. 
+
+About the author
+================
+:name: 
+       Aaron Halfaker
+:email:
+       aaron.halfaker@gmail.com
+:website:
+       http://halfaker.info --
+       http://en.wikipedia.org/wiki/User:EpochFail
+
+
+Contributors
+============
+None yet.  See http://github.com/halfak/mediawiki-utilities.  Pull requests are encouraged.
+
+
+Indices and tables
+==================
+
+.. toctree::
+   :maxdepth: 2
+   
+   types
+   core/api
+   core/database
+   core/xml_dump
+   lib/persistence
+   lib/reverts
+   lib/sessions
+   lib/title
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/persistence.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/persistence.rst
new file mode 100644 (file)
index 0000000..3c58efe
--- /dev/null
@@ -0,0 +1,35 @@
+.. _mw.lib.persistence:
+
+============================================================
+mw.lib.persistence -- tracking content between revisions
+============================================================
+
+.. autoclass:: mw.lib.persistence.State
+       :members:
+
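+Here's a sketch of a common usage pattern (this assumes ``State.process()``
+returns the current, added and removed token lists; see the class
+documentation above for the exact types)::
+
+    from mw.lib import persistence
+
+    state = persistence.State()
+
+    current, added, removed = state.process("Apples are red.", revision=1)
+    current, added, removed = state.process("Apples are red and tasty.", revision=2)
+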
+Tokenization
+============
+
+.. autoclass:: mw.lib.persistence.Tokens
+       :members:
+
+.. autoclass:: mw.lib.persistence.Token
+       :members:
+
+.. automodule:: mw.lib.persistence.tokenization
+   :members:
+   :member-order: bysource
+
+Difference
+==========
+
+.. automodule:: mw.lib.persistence.difference
+   :members:
+   :member-order: bysource
+
+Constants
+=========
+
+.. automodule:: mw.lib.persistence.defaults
+   :members:
+   :member-order: bysource
\ No newline at end of file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/reverts.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/reverts.rst
new file mode 100644 (file)
index 0000000..2b6ae9a
--- /dev/null
@@ -0,0 +1,30 @@
+.. _mw.lib.reverts:
+
+=============================================
+mw.lib.reverts -- detecting reverts
+=============================================
+
+.. automodule:: mw.lib.reverts
+
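+Here's a sketch of identity revert detection over (checksum, revision)
+pairs; the revision values here are hypothetical stand-ins for whatever
+metadata you want carried through::
+
+    from mw.lib import reverts
+
+    checksum_revisions = [
+        ("aaa", {'rev_id': 1}),
+        ("bbb", {'rev_id': 2}),
+        ("aaa", {'rev_id': 3})
+    ]
+
+    for revert in reverts.detect(checksum_revisions):
+        print(revert.reverting, revert.reverteds, revert.reverted_to)
+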
+.. autofunction:: mw.lib.reverts.detect
+
+.. autoclass:: mw.lib.reverts.Revert
+
+.. autoclass:: mw.lib.reverts.Detector
+   :members:
+
+Convenience functions
+=====================
+.. automodule:: mw.lib.reverts.api
+   :members:
+   :member-order: bysource
+
+.. automodule:: mw.lib.reverts.database
+   :members:
+   :member-order: bysource
+
+Constants
+=========
+
+.. automodule:: mw.lib.reverts.defaults
+   :members:
\ No newline at end of file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/sessions.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/sessions.rst
new file mode 100644 (file)
index 0000000..bd5bea7
--- /dev/null
@@ -0,0 +1,18 @@
+.. _mw.lib.sessions:
+
+===================================
+mw.lib.sessions -- event clustering
+===================================
+
+.. autofunction:: mw.lib.sessions.cluster
+
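+Here's a sketch of a common usage pattern; it assumes events are
+(user, timestamp, data) triples as described in ``cluster()`` above,
+with made-up users and timestamps::
+
+    from mw.lib import sessions
+
+    user_events = [
+        ("Willy", 100000, {'rev_id': 1}),
+        ("Walter", 100001, {'rev_id': 2}),
+        ("Willy", 100002, {'rev_id': 3})
+    ]
+
+    for user, events in sessions.cluster(user_events):
+        print(user, len(events))
+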
+.. autoclass:: mw.lib.sessions.Session
+
+.. autoclass:: mw.lib.sessions.Cache
+       :members:
+
+Constants
+=========
+
+.. automodule:: mw.lib.sessions.defaults
+   :members:
\ No newline at end of file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/title.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/lib/title.rst
new file mode 100644 (file)
index 0000000..61abb78
--- /dev/null
@@ -0,0 +1,15 @@
+.. _mw.lib.title:
+
+============================================================
+mw.lib.title -- parsing and normalizing titles
+============================================================
+
+.. autofunction:: mw.lib.title.normalize
+
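+For example, ``normalize()`` replaces spaces with underscores and
+capitalizes the first character::
+
+    from mw.lib import title
+
+    title.normalize("foo bar")  # 'Foo_bar'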
+
+Title parser
+================
+.. autoclass:: mw.lib.title.Parser
+   :members:
+   :member-order: bysource
+
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/doc/types.rst b/mediawiki_dump_tools/Mediawiki-Utilities/doc/types.rst
new file mode 100644 (file)
index 0000000..fba2021
--- /dev/null
@@ -0,0 +1,11 @@
+.. _mw.types:
+
+========================
+mw.types -- common types
+========================
+
+.. autoclass:: mw.Timestamp
+   :members:
+
+.. autoclass:: mw.Namespace
+   :members:
\ No newline at end of file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.deleted_revisions.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.deleted_revisions.py
new file mode 100644 (file)
index 0000000..06a1854
--- /dev/null
@@ -0,0 +1,37 @@
+"""
+Prints the rev_id, characters and hash of all revisions to Willy_on_Wheels.
+"""
+import getpass
+import hashlib
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw import api
+
+
+
+api_session = api.Session("https://en.wikipedia.org/w/api.php")
+
+print("(EN) Wikipedia credentials...")
+username = input("Username: ")
+password = getpass.getpass("Password: ")
+api_session.login(username, password)
+
+revisions = api_session.deleted_revisions.query(
+    properties={'ids', 'content'},
+    titles={'Willy on Wheels'},
+    direction="newer"
+)
+
+for rev in revisions:
+    print(
+        "{0} ({1} chars): {2}".format(
+            rev['revid'],
+            len(rev.get('*', "")),
+            hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
+        )
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.py
new file mode 100644 (file)
index 0000000..1893550
--- /dev/null
@@ -0,0 +1,19 @@
+"""
+Prints the rev_id of all revisions to User:TestAccountForMWUtils.
+"""
+import sys
+import os
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw import api
+
+api_session = api.Session("https://en.wikipedia.org/w/api.php")
+
+revisions = api_session.revisions.query(
+    properties={'ids'},
+    titles={'User:TestAccountForMWUtils'}
+)
+
+for rev in revisions:
+    print(rev['revid'])
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.recent_changes.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.recent_changes.py
new file mode 100644 (file)
index 0000000..d3d3ea3
--- /dev/null
@@ -0,0 +1,30 @@
+"""
+Prints the rcid, type, timestamp and hash of the 10 oldest edits in recent_changes.
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw import api
+
+api_session = api.Session("https://en.wikipedia.org/w/api.php")
+
+changes = api_session.recent_changes.query(
+    type={'edit', 'new'},
+    properties={'ids', 'sha1', 'timestamp'},
+    direction="newer",
+    limit=10
+)
+
+for change in changes:
+    print(
+        "{0} ({1}) @ {2}: {3}".format(
+            change['rcid'],
+            change['type'],
+            change['timestamp'],
+            change.get('sha1', "")
+        )
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.revisions.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.revisions.py
new file mode 100644 (file)
index 0000000..d255703
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+Prints the rev_id, characters and hash of all revisions to User:EpochFail.
+"""
+import sys
+import os
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+import hashlib
+from mw import api
+
+api_session = api.Session("https://en.wikipedia.org/w/api.php")
+
+revisions = api_session.revisions.query(
+    properties={'ids', 'content'},
+    titles={"User:EpochFail"},
+    direction="newer",
+    limit=51
+)
+
+for rev in revisions:
+    print(
+        "{0} ({1} chars): {2}".format(
+            rev['revid'],
+            len(rev.get('*', "")),
+            hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
+        )
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.users.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/api.users.py
new file mode 100644 (file)
index 0000000..943d38a
--- /dev/null
@@ -0,0 +1,20 @@
+"""
+Prints the user documents for EpochFail and Halfak (WMF).
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw import api
+
+api_session = api.Session("https://en.wikipedia.org/w/api.php")
+
+user_docs = api_session.users.query(
+    users=["EpochFail", "Halfak (WMF)"]
+)
+
+for user_doc in user_docs:
+    print(user_doc)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/database.users.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/database.users.py
new file mode 100644 (file)
index 0000000..fd5ce79
--- /dev/null
@@ -0,0 +1,31 @@
+"""
+Prints the user_id, user_name and edit count of 10 users registered after 2014-01-01.
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw import database
+
+
+
+db = database.DB.from_params(
+    host="analytics-store.eqiad.wmnet",
+    read_default_file="~/.my.cnf",
+    user="research",
+    db="enwiki"
+)
+
+users = db.users.query(
+    registered_after="20140101000000",
+    direction="newer",
+    limit=10
+)
+
+for user in users:
+    print("{user_id}:{user_name} -- {user_editcount} edits".format(**user))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/dump.xml b/mediawiki_dump_tools/Mediawiki-Utilities/examples/dump.xml
new file mode 100644 (file)
index 0000000..255c8d5
--- /dev/null
@@ -0,0 +1,59 @@
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
+           version="0.8" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.22wmf2</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Foo</title>
+    <ns>0</ns>
+    <id>1</id>
+    <revision>
+      <id>1</id>
+      <timestamp>2004-08-09T09:04:08Z</timestamp>
+      <contributor>
+        <username>Gen0cide</username>
+        <id>92182</id>
+      </contributor>
+      <text xml:space="preserve">Revision 1 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+    <revision>
+      <id>2</id>
+      <timestamp>2004-08-10T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.109</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 2 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <comment>Comment 2</comment>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+  <page>
+    <title>Bar</title>
+    <ns>1</ns>
+    <id>2</id>
+    <revision>
+      <id>3</id>
+      <timestamp>2004-08-11T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.22</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 3 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+</mediawiki>
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/dump2.xml b/mediawiki_dump_tools/Mediawiki-Utilities/examples/dump2.xml
new file mode 100644 (file)
index 0000000..12b7ed6
--- /dev/null
@@ -0,0 +1,31 @@
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
+           version="0.8" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.22wmf2</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Herp</title>
+    <ns>1</ns>
+    <id>2</id>
+    <revision>
+      <id>4</id>
+      <timestamp>2004-08-11T09:04:08Z</timestamp>
+      <contributor>
+        <id>10</id>
+        <name>FOobar!?</name>
+      </contributor>
+      <text xml:space="preserve">Revision 4 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+</mediawiki>
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.persistence.api.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.persistence.api.py
new file mode 100644 (file)
index 0000000..41e0500
--- /dev/null
@@ -0,0 +1,19 @@
+import pprint
+import re
+
+from mw.api import Session
+from mw.lib import persistence
+
+session = Session("https://en.wikipedia.org/w/api.php")
+
+rev, tokens_added, future_revs = persistence.api.score(session, 560561013,
+                                                       properties={'user'})
+
+words_re = re.compile(r"\w+", re.UNICODE)
+
+print("Words added")
+for token in tokens_added:
+    if words_re.search(token.text):
+        print("'{0}' survived:".format(token.text))
+        for frev in token.revisions:
+            print("\t{revid} by {user}".format(**frev))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.api.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.api.py
new file mode 100644 (file)
index 0000000..8f4012c
--- /dev/null
@@ -0,0 +1,18 @@
+"""
+Prints the reverting rev_id, rev_id and reverted to rev_id of all reverted
+revisions made by user "PermaNoob".
+"""
+from mw.api import Session
+from mw.lib import reverts
+
+session = Session("https://en.wikipedia.org/w/api.php")
+revisions = session.user_contribs.query(user={"PermaNoob"}, direction="newer")
+
+for rev in revisions:
+    revert = reverts.api.check_rev(session, rev, window=60*60*24*2)
+    if revert is not None:
+        print("{0} reverted {1} to {2}".format(
+            revert.reverting['revid'],
+            rev['revid'],
+            revert.reverted_to['revid'])
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.database.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.database.py
new file mode 100644 (file)
index 0000000..111c46d
--- /dev/null
@@ -0,0 +1,23 @@
+"""
+Prints the reverting rev_id, rev_id and reverted to rev_id of all reverted
+revisions made by user with ID 9133062.
+"""
+from mw.database import DB
+from mw.lib import reverts
+
+db = DB.from_params(
+    host="s1-analytics-slave.eqiad.wmnet",
+    read_default_file="~/.my.cnf",
+    user="research",
+    db="enwiki"
+)
+revisions = db.revisions.query(user_id=9133062)
+
+for rev_row in revisions:
+    revert = reverts.database.check_row(db, rev_row)
+    if revert is not None:
+        print("{0} reverted {1} to {2}".format(
+            revert.reverting['rev_id'],
+            rev_row['rev_id'],
+            revert.reverted_to['rev_id'])
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.reverts.py
new file mode 100644 (file)
index 0000000..46556b0
--- /dev/null
@@ -0,0 +1,21 @@
+"""
+Prints all reverted revisions of User:EpochFail.
+"""
+from mw.api import Session
+from mw.lib import reverts
+
+# Gather a page's revisions from the API
+api_session = Session("https://en.wikipedia.org/w/api.php")
+revs = api_session.revisions.query(
+    titles={"User:EpochFail"},
+    properties={'ids', 'sha1'},
+    direction="newer"
+)
+
+# Create a revision event iterator
+rev_events = ((rev['sha1'], rev) for rev in revs)
+
+# Detect and print reverts
+for revert in reverts.detect(rev_events):
+    print("{0} reverted back to {1}".format(revert.reverting['revid'],
+                                            revert.reverted_to['revid']))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.sessions.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.sessions.py
new file mode 100644 (file)
index 0000000..0b249fc
--- /dev/null
@@ -0,0 +1,17 @@
+"""
+Prints out session information for user "TestAccountForMWUtils"
+"""
+from mw.api import Session
+from mw.lib import sessions
+
+# Gather a user's revisions from the API
+api_session = Session("https://en.wikipedia.org/w/api.php")
+revs = api_session.user_contribs.query(
+    user={"TestAccountForMWUtils"},
+    direction="newer"
+)
+rev_events = ((rev['user'], rev['timestamp'], rev) for rev in revs)
+
+# Extract and print sessions
+for user, session in sessions.cluster(rev_events):
+    print("{0}'s session with {1} revisions".format(user, len(session)))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.title.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/lib.title.py
new file mode 100644 (file)
index 0000000..184164b
--- /dev/null
@@ -0,0 +1,26 @@
+"""
+Demonstrates title normalization and parsing.
+"""
+import sys
+import os
+
+sys.path.insert(0, os.path.abspath(os.getcwd()))
+
+from mw.api import Session
+from mw.lib import title
+
+# Normalize titles
+title.normalize("foo bar")
+# > "Foo_bar"
+
+# Construct a title parser from the API
+api_session = Session("https://en.wikipedia.org/w/api.php")
+parser = title.Parser.from_api(api_session)
+
+# Handles normalization
+parser.parse("user:epochFail")
+# > 2, "EpochFail"
+
+# Handles namespace aliases
+parser.parse("WT:foobar")
+# > 5, "Foobar"
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/timestamp.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/timestamp.py
new file mode 100644 (file)
index 0000000..0c5b53b
--- /dev/null
@@ -0,0 +1,27 @@
+"""
+Demonstrates some simple Timestamp operations
+"""
+from mw import Timestamp
+
+# Seconds since Unix Epoch
+str(Timestamp(1234567890))
+# > '20090213233130'
+
+# Database format
+int(Timestamp("20090213233130"))
+# > 1234567890
+
+# API format
+int(Timestamp("2009-02-13T23:31:30Z"))
+# > 1234567890
+
+# Difference in seconds
+Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
+# > 1
+
+# strptime and strftime
+Timestamp(1234567890).strftime("%Y foobar")
+# > '2009 foobar'
+
+str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
+# > '20091001000000'
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.iteration.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.iteration.py
new file mode 100644 (file)
index 0000000..7a9d4c4
--- /dev/null
@@ -0,0 +1,14 @@
+"""
+Prints out all rev_ids that appear in dump.xml.
+"""
+from mw.xml_dump import Iterator
+
+# Construct dump file iterator
+dump = Iterator.from_file(open("examples/dump.xml"))
+
+# Iterate through pages
+for page in dump:
+
+    # Iterate through a page's revisions
+    for revision in page:
+        print(revision.id)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.map.py b/mediawiki_dump_tools/Mediawiki-Utilities/examples/xml_dump.map.py
new file mode 100644 (file)
index 0000000..66fa9cf
--- /dev/null
@@ -0,0 +1,15 @@
+"""
+Processes two dump files.
+"""
+from mw import xml_dump
+
+files = ["examples/dump.xml", "examples/dump2.xml"]
+
+
+def page_info(dump, path):
+    for page in dump:
+        yield page.id, page.namespace, page.title
+
+
+for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
+    print("\t".join([str(page_id), str(page_namespace), page_title]))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/__init__.py
new file mode 100644 (file)
index 0000000..6252945
--- /dev/null
@@ -0,0 +1,3 @@
+from .types import Timestamp, Namespace
+
+__version__ = "0.4.18"
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/__init__.py
new file mode 100644 (file)
index 0000000..ced86a2
--- /dev/null
@@ -0,0 +1,5 @@
+from . import errors
+from .session import Session
+
+from .collections import Pages, RecentChanges, Revisions, SiteInfo, \
+    UserContribs, DeletedRevisions
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/__init__.py
new file mode 100644 (file)
index 0000000..dcfc3e2
--- /dev/null
@@ -0,0 +1,7 @@
+from .deleted_revisions import DeletedRevisions
+from .pages import Pages
+from .recent_changes import RecentChanges
+from .revisions import Revisions
+from .site_info import SiteInfo
+from .user_contribs import UserContribs
+from .users import Users
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/collection.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/collection.py
new file mode 100644 (file)
index 0000000..bc4285a
--- /dev/null
@@ -0,0 +1,68 @@
+import re
+
+
+class Collection:
+    """
+    Represents a collection of items that can be queried via the API.  This is
+    an abstract base class that should be extended.
+    """
+
+    TIMESTAMP = re.compile(r"[0-9]{4}-?[0-9]{2}-?[0-9]{2}T?" +
+                           r"[0-9]{2}:?[0-9]{2}:?[0-9]{2}Z?")
+    """
+    A regular expression for matching the API's timestamp format.
+    """
+
+    DIRECTIONS = {'newer', 'older'}
+    """
+    A set of potential direction names.
+    """
+
+    def __init__(self, session):
+        """
+        :Parameters:
+            session : `mw.api.Session`
+                An api session to use for post & get.
+        """
+        self.session = session
+    
+    def _check_direction(self, direction):
+        if direction is None:
+            return direction
+        else:
+            direction = str(direction)
+
+            assert direction in {None} | self.DIRECTIONS, \
+                "Direction must be one of {0}".format(self.DIRECTIONS)
+
+            return direction
+
+    def _check_timestamp(self, timestamp):
+        if timestamp is None:
+            return timestamp
+        else:
+            timestamp = str(timestamp)
+
+            if not self.TIMESTAMP.match(timestamp):
+                raise TypeError(
+                    "{0} is not formatted like ".format(repr(timestamp)) +
+                    "a MediaWiki timestamp."
+                )
+
+            return timestamp
+
+    def _items(self, items, none=True, levels=None, type=lambda val: val):
+
+        if none and items is None:
+            return None
+        else:
+            items = {str(type(item)) for item in items}
+
+            if levels is not None:
+                levels = {str(level) for level in levels}
+
+                assert len(items - levels) == 0, \
+                    "items {0} not in levels {1}".format(
+                        items - levels, levels)
+
+            return "|".join(items)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/deleted_revisions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/deleted_revisions.py
new file mode 100644 (file)
index 0000000..f2dda38
--- /dev/null
@@ -0,0 +1,150 @@
+import logging
+import sys
+
+from ...types import Timestamp
+from ...util import none_or
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.deletedrevs")
+
+
+class DeletedRevisions(Collection):
+    PROPERTIES = {'ids', 'flags', 'timestamp', 'user', 'userid', 'size',
+                  'sha1', 'contentmodel', 'comment', 'parsedcomment', 'content',
+                  'tags'}
+
+    # TODO:
+    # This is *not* the right way to do this, but it should work for all queries.
+    MAX_REVISIONS = 500
+
+    def get(self, rev_id, *args, **kwargs):
+
+        rev_id = int(rev_id)
+
+        revs = list(self.query(revids={rev_id}, **kwargs))
+
+        if len(revs) < 1:
+            raise KeyError(rev_id)
+        else:
+            return revs[0]
+
+    def query(self, *args, limit=sys.maxsize, **kwargs):
+        """
+        Queries deleted revisions.
+        See https://www.mediawiki.org/wiki/API:Deletedrevs
+
+        :Parameters:
+            titles : set(str)
+                A set of page names to query (note that namespace prefix is expected)
+            start : :class:`mw.Timestamp`
+                A timestamp to start querying from
+            end : :class:`mw.Timestamp`
+                A timestamp to end querying
+            from_title : str
+                A title from which to start querying (alphabetically)
+            to_title : str
+                A title from which to stop querying (alphabetically)
+            prefix : str
+                A title prefix to match on
+            drcontinue : str
+                When more results are available, use this to continue. Note: this may only work if direction is set to "newer".
+            unique : bool
+                List only one revision for each page
+            tag : str
+                Only list revisions tagged with this tag
+            user : str
+                Only list revisions saved by this user_text
+            excludeuser : str
+                Do not list revisions saved by this user_text
+            namespace : int
+                Only list pages in this namespace (id)
+            limit : int
+                Limit the number of results
+            direction : str
+                "newer" or "older"
+            properties : set(str)
+                A list of properties to include in the results:
+
+
+                * ids            - The ID of the revision.
+                * flags          - Revision flags (minor).
+                * timestamp      - The timestamp of the revision.
+                * user           - User that made the revision.
+                * userid         - User ID of the revision creator.
+                * size           - Length (bytes) of the revision.
+                * sha1           - SHA-1 (base 16) of the revision.
+                * contentmodel   - Content model ID of the revision.
+                * comment        - Comment by the user for the revision.
+                * parsedcomment  - Parsed comment by the user for the revision.
+                * content        - Text of the revision.
+                * tags           - Tags for the revision.
+        """
+        # `limit` means something different here
+        kwargs['limit'] = min(limit, self.MAX_REVISIONS)
+        revisions_yielded = 0
+        done = False
+        while not done and revisions_yielded <= limit:
+            rev_docs, query_continue = self._query(*args, **kwargs)
+            for doc in rev_docs:
+                yield doc
+                revisions_yielded += 1
+                if revisions_yielded >= limit:
+                    break
+
+            if query_continue != "" and len(rev_docs) > 0:
+                kwargs['query_continue'] = query_continue
+            else:
+                done = True
+
+    def _query(self, titles=None, pageids=None, revids=None,
+               start=None, end=None, query_continue=None, unique=None, tag=None,
+               user=None, excludeuser=None, namespace=None, limit=None,
+               properties=None, direction=None):
+
+        params = {
+            'action': "query",
+            'prop': "deletedrevisions"
+        }
+
+        params['titles'] = self._items(titles)
+        params['pageids'] = self._items(pageids)
+        params['revids'] = self._items(revids)
+        params['drvprop'] = self._items(properties, levels=self.PROPERTIES)
+        params['drvlimit'] = none_or(limit, int)
+        params['drvstart'] = self._check_timestamp(start)
+        params['drvend'] = self._check_timestamp(end)
+
+        params['drvdir'] = self._check_direction(direction)
+        params['drvuser'] = none_or(user, str)
+        params['drvexcludeuser'] = none_or(excludeuser, str)
+        params['drvtag'] = none_or(tag, str)
+        params.update(query_continue or {'continue': ""})
+
+        doc = self.session.get(params)
+
+        try:
+            if 'continue' in doc:
+                query_continue = doc['continue']
+            else:
+                query_continue = ''
+
+            pages = doc['query']['pages'].values()
+            rev_docs = []
+
+            for page_doc in pages:
+                page_rev_docs = page_doc.get('deletedrevisions', [])
+
+                try: del page_doc['deletedrevisions']
+                except KeyError: pass
+
+                for rev_doc in page_rev_docs:
+                    rev_doc['page'] = page_doc
+
+                rev_docs.extend(page_rev_docs)
+
+            return rev_docs, query_continue
+
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/pages.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/pages.py
new file mode 100644 (file)
index 0000000..2a2fdf7
--- /dev/null
@@ -0,0 +1,50 @@
+import logging
+
+from ...util import none_or
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.pages")
+
+
+class Pages(Collection):
+    """
+    TODO
+    """
+
+    def _edit(self, title=None, pageid=None, section=None, sectiontitle=None,
+              text=None, token=None, summary=None, minor=None,
+              notminor=None, bot=None, basetimestamp=None,
+              starttimestamp=None, recreate=None, createonly=None,
+              nocreate=None, watch=None, unwatch=None, watchlist=None,
+              md5=None, prependtext=None, appendtext=None, undo=None,
+              undoafter=None, redirect=None, contentformat=None,
+              contentmodel=None, assert_=None, nassert=None,
+              captchaword=None, captchaid=None):
+        params = {
+            'action': "edit"
+        }
+        params['title'] = none_or(title, str)
+        params['pageid'] = none_or(pageid, int)
+        params['section'] = none_or(section, int, levels={'new'})
+        params['sectiontitle'] = none_or(sectiontitle, str)
+        params['text'] = none_or(text, str)
+        params['token'] = none_or(token, str)
+        params['summary'] = none_or(summary, str)
+        params['minor'] = none_or(minor, bool)
+        params['notminor'] = none_or(notminor, bool)
+        params['bot'] = none_or(bot, bool)
+        params['basetimestamp'] = self._check_timestamp(basetimestamp)
+        params['starttimestamp'] = self._check_timestamp(starttimestamp)
+        params['recreate'] = none_or(recreate, bool)
+        params['createonly'] = none_or(createonly, bool)
+        params['nocreate'] = none_or(nocreate, bool)
+        params['watch'] = none_or(watch, bool)
+        params['unwatch'] = none_or(unwatch, bool)
+        params['watchlist'] = none_or(watchlist, bool)
+        params['md5'] = none_or(md5, str)
+        params['prependtext'] = none_or(prependtext, str)
+        params['appendtext'] = none_or(appendtext, str)
+        params['undo'] = none_or(undo, int)
+        params['undoafter'] = none_or(undoafter, int)
+
+        # TODO finish this
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/recent_changes.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/recent_changes.py
new file mode 100644 (file)
index 0000000..01b90d7
--- /dev/null
@@ -0,0 +1,192 @@
+import logging
+import re
+
+from ...util import none_or
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.recent_changes")
+
+
+class RecentChanges(Collection):
+    """
+    Recent changes (revisions, page creations, registrations, moves, etc.)
+    """
+
+    RCCONTINUE = re.compile(r"([0-9]{4}-[0-9]{2}-[0-9]{2}T" +
+                            r"[0-9]{2}:[0-9]{2}:[0-9]{2}Z|" +
+                            r"[0-9]{14})" +
+                            r"\|[0-9]+")
+
+    PROPERTIES = {'user', 'userid', 'comment', 'timestamp', 'title',
+                  'ids', 'sizes', 'redirect', 'flags', 'loginfo',
+                  'tags', 'sha1'}
+
+    SHOW = {'minor', '!minor', 'bot', '!bot', 'anon', '!anon',
+            'redirect', '!redirect', 'patrolled', '!patrolled'}
+    
+    TYPES = {'edit', 'external', 'new', 'log'}
+    
+    DIRECTIONS = {'newer', 'older'}
+
+    MAX_CHANGES = 50
+
+    def _check_rccontinue(self, rccontinue):
+        if rccontinue is None:
+            return None
+        elif self.RCCONTINUE.match(rccontinue):
+            return rccontinue
+        else:
+            raise TypeError(
+                "rccontinue {0} is not formatted correctly ".format(rccontinue) +
+                "'%Y-%m-%dT%H:%M:%SZ|<last_rcid>'"
+            )
+
+    def query(self, *args, limit=None, **kwargs):
+        """
+        Enumerate recent changes.
+        See `<https://www.mediawiki.org/wiki/API:Recentchanges>`_
+
+        :Parameters:
+            start : :class:`mw.Timestamp`
+                The timestamp to start enumerating from
+            end : :class:`mw.Timestamp`
+                The timestamp to end enumerating
+            direction :
+                "newer" or "older"
+            namespace : int
+                Filter changes to only the given namespace(s)
+            user : str
+                Only list changes by this user
+            excludeuser : str
+                Don't list changes by this user
+            tag : str
+                Only list changes tagged with this tag
+            properties : set(str)
+                Include additional pieces of information
+
+                * user           - Adds the user responsible for the edit and tags if they are an IP
+                * userid         - Adds the user id responsible for the edit
+                * comment        - Adds the comment for the edit
+                * parsedcomment  - Adds the parsed comment for the edit
+                * flags          - Adds flags for the edit
+                * timestamp      - Adds timestamp of the edit
+                * title          - Adds the page title of the edit
+                * ids            - Adds the page ID, recent changes ID and the new and old revision ID
+                * sizes          - Adds the new and old page length in bytes
+                * redirect       - Tags edit if page is a redirect
+                * patrolled      - Tags patrollable edits as being patrolled or unpatrolled
+                * loginfo        - Adds log information (logid, logtype, etc) to log entries
+                * tags           - Lists tags for the entry
+                * sha1           - Adds the content checksum for entries associated with a revision
+
+            token : set(str)
+                Which tokens to obtain for each change
+
+                * patrol
+
+            show : set(str)
+                Show only items that meet these criteria. For example, to see
+                only minor edits done by logged-in users, set
+                show={'minor', '!anon'}.
+
+                * minor
+                * !minor
+                * bot
+                * !bot
+                * anon
+                * !anon
+                * redirect
+                * !redirect
+                * patrolled
+                * !patrolled
+                * unpatrolled
+            limit : int
+                How many total changes to return
+            type : set(str)
+                Which types of changes to show
+
+                * edit
+                * external
+                * new
+                * log
+
+            toponly : bool
+                Only list changes which are the latest revision
+            rccontinue : str
+                Use this to continue loading results from where you last left off
+        """
+        limit = none_or(limit, int)
+
+        changes_yielded = 0
+        done = False
+        while not done:
+
+            if limit is None:
+                kwargs['limit'] = self.MAX_CHANGES
+            else:
+                kwargs['limit'] = min(limit - changes_yielded, self.MAX_CHANGES)
+
+            rc_docs, rccontinue = self._query(*args, **kwargs)
+
+            for doc in rc_docs:
+                yield doc
+                changes_yielded += 1
+
+                if limit is not None and changes_yielded >= limit:
+                    done = True
+                    break
+
+            if rccontinue is not None and len(rc_docs) > 0:
+
+                kwargs['rccontinue'] = rccontinue
+            else:
+                done = True
+
+    def _query(self, start=None, end=None, direction=None, namespace=None,
+               user=None, excludeuser=None, tag=None, properties=None,
+               token=None, show=None, limit=None, type=None,
+               toponly=None, rccontinue=None):
+
+        params = {
+            'action': "query",
+            'list': "recentchanges"
+        }
+
+        params['rcstart'] = none_or(start, str)
+        params['rcend'] = none_or(end, str)
+
+        assert direction in {None} | self.DIRECTIONS, \
+            "Direction must be one of {0}".format(self.DIRECTIONS)
+
+        params['rcdir'] = direction
+        params['rcnamespace'] = none_or(namespace, int)
+        params['rcuser'] = none_or(user, str)
+        params['rcexcludeuser'] = none_or(excludeuser, str)
+        params['rctag'] = none_or(tag, str)
+        params['rcprop'] = self._items(properties, levels=self.PROPERTIES)
+        params['rctoken'] = self._items(token)
+        params['rcshow'] = self._items(show, levels=self.SHOW)
+        params['rclimit'] = none_or(limit, int)
+        params['rctype'] = self._items(type, levels=self.TYPES)
+        params['rctoponly'] = none_or(toponly, bool)
+        params['rccontinue'] = self._check_rccontinue(rccontinue)
+
+        doc = self.session.get(params)
+
+        try:
+            rc_docs = doc['query']['recentchanges']
+
+            if 'query-continue' in doc:
+                rccontinue = \
+                        doc['query-continue']['recentchanges']['rccontinue']
+            elif len(rc_docs) > 0:
+                rccontinue = "|".join([rc_docs[-1]['timestamp'],
+                                       str(rc_docs[-1]['rcid'] + 1)])
+            else:
+                pass  # Leave it be
+
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
+
+        return rc_docs, rccontinue
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/revisions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/revisions.py
new file mode 100644 (file)
index 0000000..4d0c67e
--- /dev/null
@@ -0,0 +1,220 @@
+import logging
+
+from ...util import none_or
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.revisions")
+
+
+class Revisions(Collection):
+    """
+    A collection of revisions indexed by title, page_id and user_text.
+    Note that revisions of deleted pages are queryable via
+    :class:`mw.api.DeletedRevisions`.
+    """
+    
+    PROPERTIES = {'ids', 'flags', 'timestamp', 'user', 'userid', 'size',
+                  'sha1', 'contentmodel', 'comment', 'parsedcomment',
+                  'content', 'tags', 'flagged'}
+    
+    DIFF_TO = {'prev', 'next', 'cur'}
+    
+    # This is *not* the right way to do this, but it should work for all queries.
+    MAX_REVISIONS = 50
+    
+    def get(self, rev_id, **kwargs):
+        """
+        Get a single revision based on its ID.  Throws a :py:class:`KeyError`
+        if the rev_id cannot be found.
+        
+        :Parameters:
+            rev_id : int
+                Revision ID
+            ``**kwargs``
+                Passed to :py:meth:`query`
+            
+        :Returns:
+            A single rev dict
+        """
+        rev_id = int(rev_id)
+        
+        revs = list(self.query(revids={rev_id}, **kwargs))
+        
+        if len(revs) < 1:
+            raise KeyError(rev_id)
+        else:
+            return revs[0]
+        
+    def query(self, *args, limit=None, **kwargs):
+        """
+        Get revision information.
+        See `<https://www.mediawiki.org/wiki/API:Properties#revisions_.2F_rv>`_
+        
+        :Parameters:
+            properties : set(str)
+                Which properties to get for each revision:
+                
+                * ids            - The ID of the revision
+                * flags          - Revision flags (minor)
+                * timestamp      - The timestamp of the revision
+                * user           - User that made the revision
+                * userid         - User id of revision creator
+                * size           - Length (bytes) of the revision
+                * sha1           - SHA-1 (base 16) of the revision
+                * contentmodel   - Content model id
+                * comment        - Comment by the user for revision
+                * parsedcomment  - Parsed comment by the user for the revision
+                * content        - Text of the revision
+                * tags           - Tags for the revision
+            limit : int
+                Limit how many revisions will be returned
+                No more than 500 (5000 for bots) allowed
+            start_id : int
+                From which revision id to start enumeration (enum)
+            end_id : int
+                Stop revision enumeration on this revid
+            start : :class:`mw.Timestamp`
+                From which revision timestamp to start enumeration (enum)
+            end : :class:`mw.Timestamp`
+                Enumerate up to this timestamp
+            direction : str
+                "newer" or "older"
+            user : str
+                Only include revisions made by user_text
+            excludeuser : str
+                Exclude revisions made by user
+            tag : str
+                Only list revisions tagged with this tag
+            expandtemplates : bool
+                Expand templates in revision content (requires "content" property)
+            generatexml : bool
+                Generate XML parse tree for revision content (requires "content" property)
+            parse : bool
+                Parse revision content (requires "content" property)
+            section : int
+                Only retrieve the content of this section number
+            token : set(str)
+                Which tokens to obtain for each revision
+                
+                * rollback - See `<https://www.mediawiki.org/wiki/API:Edit_-_Rollback#Token>`_
+            rvcontinue : str
+                When more results are available, use this to continue
+            diffto : int
+                Revision ID to diff each revision to. Use "prev", "next" and
+                "cur" for the previous, next and current revision respectively
+            difftotext : str
+                Text to diff each revision to. Only diffs a limited number of
+                revisions. Overrides diffto. If section is set, only that
+                section will be diffed against this text
+            contentformat : str
+                Serialization format used for difftotext and expected for output of content
+                
+                * text/x-wiki
+                * text/javascript
+                * text/css
+                * text/plain
+                * application/json
+        
+        :Returns:
+            An iterator of rev dicts returned from the API.
+        """
+        
+        revisions_yielded = 0
+        done = False
+        while not done:
+            if limit is None:
+                kwargs['limit'] = self.MAX_REVISIONS
+            else:
+                kwargs['limit'] = min(limit - revisions_yielded, self.MAX_REVISIONS)
+            
+            rev_docs, rvcontinue = self._query(*args, **kwargs)
+            
+            for doc in rev_docs:
+                yield doc
+                revisions_yielded += 1
+                
+                if limit is not None and revisions_yielded >= limit:
+                    done = True
+                    break
+                
+            if rvcontinue is not None and len(rev_docs) > 0:
+                kwargs['rvcontinue'] = rvcontinue
+            else:
+                done = True
+            
+    
+    def _query(self, revids=None, titles=None, pageids=None, properties=None,
+                     limit=None, start_id=None, end_id=None, start=None,
+                     end=None, direction=None, user=None, excludeuser=None,
+                     tag=None, expandtemplates=None, generatexml=None,
+                     parse=None, section=None, token=None, rvcontinue=None,
+                     diffto=None, difftotext=None, contentformat=None):
+        
+        params = {
+            'action': "query",
+            'prop': "revisions",
+            'rawcontinue': ''
+        }
+        
+        params['revids'] = self._items(revids, type=int)
+        params['titles'] = self._items(titles)
+        params['pageids'] = self._items(pageids, type=int)
+        
+        params['rvprop'] = self._items(properties, levels=self.PROPERTIES)
+        
+        if revids is None:  # Can't have a limit unless revids is None
+            params['rvlimit'] = none_or(limit, int)
+            
+        params['rvstartid'] = none_or(start_id, int)
+        params['rvendid'] = none_or(end_id, int)
+        params['rvstart'] = self._check_timestamp(start)
+        params['rvend'] = self._check_timestamp(end)
+        
+        params['rvdir'] = self._check_direction(direction)
+        params['rvuser'] = none_or(user, str)
+        params['rvexcludeuser'] = none_or(excludeuser, str)
+        params['rvtag'] = none_or(tag, str)
+        params['rvexpandtemplates'] = none_or(expandtemplates, bool)
+        params['rvgeneratexml'] = none_or(generatexml, bool)
+        params['rvparse'] = none_or(parse, bool)
+        params['rvsection'] = none_or(section, int)
+        params['rvtoken'] = none_or(token, str)
+        params['rvcontinue'] = none_or(rvcontinue, str)
+        params['rvdiffto'] = self._check_diffto(diffto)
+        params['rvdifftotext'] = none_or(difftotext, str)
+        params['rvcontentformat'] = none_or(contentformat, str)
+        
+        doc = self.session.get(params)
+        
+        try:
+            if 'query-continue' in doc:
+                rvcontinue = doc['query-continue']['revisions']['rvcontinue']
+            else:
+                rvcontinue = None
+            
+            pages = doc['query'].get('pages', {}).values()
+            rev_docs = []
+            
+            for page_doc in pages:
+                if 'missing' in page_doc or 'revisions' not in page_doc: continue
+                
+                page_rev_docs = page_doc['revisions']
+                del page_doc['revisions']
+                
+                for rev_doc in page_rev_docs:
+                    rev_doc['page'] = page_doc
+                
+                rev_docs.extend(page_rev_docs)
+            
+            return rev_docs, rvcontinue
+            
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
+        
+    
+    def _check_diffto(self, diffto):
+        if diffto is None or diffto in self.DIFF_TO:
+            return diffto
+        else:
+            return int(diffto)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/site_info.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/site_info.py
new file mode 100644 (file)
index 0000000..f6a5807
--- /dev/null
@@ -0,0 +1,81 @@
+import logging
+
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.site_info")
+
+
+class SiteInfo(Collection):
+    """
+    General information about the site.
+    """
+
+    PROPERTIES = {'general', 'namespaces', 'namespacealiases',
+                  'specialpagealiases', 'magicwords', 'interwikimap',
+                  'dbrepllag', 'statistics', 'usergroups', 'extensions',
+                  'fileextensions', 'rightsinfo', 'languages', 'skins',
+                  'extensiontags', 'functionhooks', 'showhooks',
+                  'variables', 'protocols'}
+
+    FILTERIW = {'local', '!local'}
+
+    def query(self, properties=None, filteriw=None, showalldb=None,
+              numberinggroup=None, inlanguagecode=None):
+        """
+        General information about the site.
+        See `<https://www.mediawiki.org/wiki/API:Meta#siteinfo_.2F_si>`_
+
+        :Parameters:
+            properties: set(str)
+                Which sysinfo properties to get:
+
+                * general               - Overall system information
+                * namespaces            - List of registered namespaces and their canonical names
+                * namespacealiases      - List of registered namespace aliases
+                * specialpagealiases    - List of special page aliases
+                * magicwords            - List of magic words and their aliases
+                * statistics            - Returns site statistics
+                * interwikimap          - Returns interwiki map (optionally filtered; optionally localised by using siinlanguagecode)
+                * dbrepllag             - Returns database server with the highest replication lag
+                * usergroups            - Returns user groups and the associated permissions
+                * extensions            - Returns extensions installed on the wiki
+                * fileextensions        - Returns list of file extensions allowed to be uploaded
+                * rightsinfo            - Returns wiki rights (license) information if available
+                * restrictions          - Returns information on available restriction (protection) types
+                * languages             - Returns a list of languages MediaWiki supports (optionally localised by using siinlanguagecode)
+                * skins                 - Returns a list of all enabled skins
+                * extensiontags         - Returns a list of parser extension tags
+                * functionhooks         - Returns a list of parser function hooks
+                * showhooks             - Returns a list of all subscribed hooks (contents of $wgHooks)
+                * variables             - Returns a list of variable IDs
+                * protocols             - Returns a list of protocols that are allowed in external links.
+                * defaultoptions        - Returns the default values for user preferences.
+            filteriw : str
+                "local" or "!local" Return only local or only nonlocal entries of the interwiki map
+            showalldb : bool
+                List all database servers, not just the one lagging the most
+            numberinggroup : bool
+                Lists the number of users in user groups
+            inlanguagecode : bool
+                Language code for localised language names (best effort, use CLDR extension)
+  """
+
+        siprop = self._items(properties, levels=self.PROPERTIES)
+
+        doc = self.session.get(
+            {
+                'action': "query",
+                'meta': "siteinfo",
+                'siprop': siprop,
+                'sifilteriw': filteriw,
+                'sishowalldb': showalldb,
+                'sinumberingroup': numberinggroup,
+                'siinlanguagecode': inlanguagecode
+            }
+        )
+
+        try:
+            return doc['query']
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/user_contribs.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/user_contribs.py
new file mode 100644 (file)
index 0000000..7d16144
--- /dev/null
@@ -0,0 +1,132 @@
+import logging
+
+from ...util import none_or
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.user_contribs")
+
+
+class UserContribs(Collection):
+    """
+    A collection of revisions indexed by user.
+    """
+
+    PROPERTIES = {'ids', 'title', 'timestamp', 'comment', 'parsedcomment',
+                  'size', 'sizediff', 'flags', 'patrolled', 'tags'}
+
+    SHOW = {'minor', '!minor', 'patrolled', '!patrolled'}
+
+    MAX_REVISIONS = 50
+
+    def query(self, *args, limit=None, **kwargs):
+        """
+        Get a user's revisions.
+        See `<https://www.mediawiki.org/wiki/API:Usercontribs>`_
+
+        :Parameters:
+            limit : int
+                The maximum number of contributions to return.
+            start : :class:`mw.Timestamp`
+                The start timestamp to return from
+            end : :class:`mw.Timestamp`
+                The end timestamp to return to
+            user : set(str)
+                The users to retrieve contributions for.  Maximum number of values 50 (500 for bots)
+            userprefix : set(str)
+                Retrieve contributions for all users whose names begin with this value.
+            direction : str
+                "newer" or "older"
+            namespace : int
+                Only list contributions in these namespaces
+            properties :
+                Include additional pieces of information
+
+                * ids            - Adds the page ID and revision ID
+                * title          - Adds the title and namespace ID of the page
+                * timestamp      - Adds the timestamp of the edit
+                * comment        - Adds the comment of the edit
+                * parsedcomment  - Adds the parsed comment of the edit
+                * size           - Adds the new size of the edit
+                * sizediff       - Adds the size delta of the edit against its parent
+                * flags          - Adds flags of the edit
+                * patrolled      - Tags patrolled edits
+                * tags           - Lists tags for the edit
+            show : set(str)
+                Show only items that meet these criteria, e.g. non-minor edits only: show={'!minor'}.
+                NOTE: If ucshow=patrolled or ucshow=!patrolled is set, revisions older than
+                $wgRCMaxAge (2592000) won't be shown
+
+                * minor
+                * !minor
+                * patrolled
+                * !patrolled
+                * top
+                * !top
+                * new
+                * !new
+            tag : str
+                Only list revisions tagged with this tag
+            toponly : bool
+                DEPRECATED! Only list changes which are the latest revision
+        """
+        limit = none_or(limit, int)
+
+        revisions_yielded = 0
+        done = False
+        while not done:
+
+            if limit is None:
+                kwargs['limit'] = self.MAX_REVISIONS
+            else:
+                kwargs['limit'] = min(limit - revisions_yielded, self.MAX_REVISIONS)
+
+            uc_docs, uccontinue = self._query(*args, **kwargs)
+
+            for doc in uc_docs:
+                yield doc
+                revisions_yielded += 1
+
+                if limit is not None and revisions_yielded >= limit:
+                    done = True
+                    break
+
+            if uccontinue is None or len(uc_docs) == 0:
+                done = True
+            else:
+                kwargs['uccontinue'] = uccontinue
+
+    def _query(self, user=None, userprefix=None, limit=None, start=None,
+               end=None, direction=None, namespace=None, properties=None,
+               show=None, tag=None, toponly=None,
+               uccontinue=None):
+
+        params = {
+            'action': "query",
+            'list': "usercontribs"
+        }
+        params['uclimit'] = none_or(limit, int)
+        params['ucstart'] = self._check_timestamp(start)
+        params['ucend'] = self._check_timestamp(end)
+        if uccontinue is not None:
+            params.update(uccontinue)
+        params['ucuser'] = self._items(user, type=str)
+        params['ucuserprefix'] = self._items(userprefix, type=str)
+        params['ucdir'] = self._check_direction(direction)
+        params['ucnamespace'] = none_or(namespace, int)
+        params['ucprop'] = self._items(properties, levels=self.PROPERTIES)
+        params['ucshow'] = self._items(show, levels=self.SHOW)
+
+        doc = self.session.get(params)
+        try:
+            if 'query-continue' in doc:
+                uccontinue = doc['query-continue']['usercontribs']
+            else:
+                uccontinue = None
+
+            uc_docs = doc['query']['usercontribs']
+
+            return uc_docs, uccontinue
+
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/users.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/collections/users.py
new file mode 100644 (file)
index 0000000..89d59aa
--- /dev/null
@@ -0,0 +1,83 @@
+import logging
+
+from ...util import none_or
+from ..errors import MalformedResponse
+from .collection import Collection
+
+logger = logging.getLogger("mw.api.collections.users")
+
+
+class Users(Collection):
+    """
+    A collection of information about users
+    """
+
+    PROPERTIES = {'blockinfo', 'implicitgroups', 'groups', 'registration',
+                  'emailable', 'editcount', 'gender'}
+
+    SHOW = {'minor', '!minor', 'patrolled', '!patrolled'}
+
+    MAX_REVISIONS = 50
+
+    def query(self, *args, **kwargs):
+        """
+        Get a user's metadata.
+        See `<https://www.mediawiki.org/wiki/API:Users>`_
+
+        :Parameters:
+            users : set(str)
+                The usernames of the users to be retrieved.
+            
+            properties : set(str)
+                Include additional pieces of information
+
+                * blockinfo      - Tags if the user is blocked, by whom, and
+                                   for what reason
+                * groups         - Lists all the groups the user(s) belongs to
+                * implicitgroups - Lists all the groups a user is automatically
+                                   a member of
+                * rights         - Lists all the rights the user(s) has
+                * editcount      - Adds the user's edit count
+                * registration   - Adds the user's registration timestamp
+                * emailable      - Tags if the user can and wants to receive
+                                   email through [[Special:Emailuser]]
+                * gender         - Tags the gender of the user. Returns "male",
+                                   "female", or "unknown"
+        """
+        done = False
+        while not done:
+
+            us_docs, query_continue = self._query(*args, **kwargs)
+
+            for doc in us_docs:
+                yield doc
+
+            if query_continue is None or len(us_docs) == 0:
+                done = True
+            else:
+                kwargs['query_continue'] = query_continue
+
+    def _query(self, users, query_continue=None, properties=None):
+
+        params = {
+            'action': "query",
+            'list': "users"
+        }
+        params['ususers'] = self._items(users, type=str)
+        params['usprop'] = self._items(properties, levels=self.PROPERTIES)
+        if query_continue is not None:
+            params.update(query_continue)
+
+        doc = self.session.get(params)
+        try:
+            if 'query-continue' in doc:
+                query_continue = doc['query-continue']['users']
+            else:
+                query_continue = None
+
+            us_docs = doc['query']['users']
+
+            return us_docs, query_continue
+
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/errors.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/errors.py
new file mode 100644 (file)
index 0000000..621ce27
--- /dev/null
@@ -0,0 +1,48 @@
+class DocError(Exception):
+    def __init__(self, message, doc):
+        super().__init__(message)
+
+        self.doc = doc
+        """
+        The document returned by the API that brought about this error.
+        """
+
+
+class APIError(DocError):
+    def __init__(self, doc):
+
+        code = doc.get('error', {}).get('code')
+        message = doc.get('error', {}).get('info')
+
+        super().__init__("{0}:{1}".format(code, message), doc)
+
+        self.code = code
+        """
+        The error code returned by the api -- if available.
+        """
+
+        self.message = message
+        """
+        The error message returned by the api -- if available.
+        """
+
+
+class AuthenticationError(DocError):
+    def __init__(self, doc):
+        result = doc['login']['result']
+        super().__init__(result, doc)
+
+        self.result = result
+        """
+        The result code of an authentication attempt.
+        """
+
+
+class MalformedResponse(DocError):
+    def __init__(self, key, doc):
+
+        super().__init__("Expected to find '{0}' in result.".format(key), doc)
+
+        self.key = key
+        """
+        The expected, but missing key from the API call.
+        """
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/session.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/api/session.py
new file mode 100644 (file)
index 0000000..630451f
--- /dev/null
@@ -0,0 +1,134 @@
+import logging
+
+from ..util import api
+from .collections import (DeletedRevisions, Pages, RecentChanges, Revisions,
+                          SiteInfo, UserContribs, Users)
+from .errors import APIError, AuthenticationError, MalformedResponse
+
+logger = logging.getLogger("mw.api.session")
+
+DEFAULT_USER_AGENT = "MediaWiki-Utilities"
+"""
+The default User-Agent to be sent with requests to the API.
+"""
+
+class Session(api.Session):
+    """
+    Represents a connection to a MediaWiki API.
+
+    Cookies and other session information is preserved.
+
+    :Parameters:
+        uri : str
+            The base URI for the API to use.  Usually ends in "api.php"
+        user_agent : str
+            The User-Agent to be sent with requests.  A warning will be
+            logged if this is left at the default value.
+    """
+
+    def __init__(self, uri, *args, user_agent=DEFAULT_USER_AGENT, **kwargs):
+        """
+        Constructs a new :class:`Session`.
+        """
+
+        if user_agent == DEFAULT_USER_AGENT:
+            logger.warning("Sending requests with default User-Agent.  "  +
+                           "Set 'user_agent' on api.Session to quiet this " +
+                           "message.")
+
+        if 'headers' in kwargs:
+            kwargs['headers']['User-Agent'] = str(user_agent)
+        else:
+            kwargs['headers'] = {'User-Agent': str(user_agent)}
+
+        super().__init__(uri, *args, **kwargs)
+
+        self.pages = Pages(self)
+        """
+        An instance of :class:`mw.api.Pages`.
+        """
+
+        self.revisions = Revisions(self)
+        """
+        An instance of :class:`mw.api.Revisions`.
+        """
+
+        self.recent_changes = RecentChanges(self)
+        """
+        An instance of :class:`mw.api.RecentChanges`.
+        """
+
+        self.site_info = SiteInfo(self)
+        """
+        An instance of :class:`mw.api.SiteInfo`.
+        """
+
+        self.user_contribs = UserContribs(self)
+        """
+        An instance of :class:`mw.api.UserContribs`.
+        """
+
+        self.users = Users(self)
+        """
+        An instance of :class:`mw.api.Users`.
+        """
+
+        self.deleted_revisions = DeletedRevisions(self)
+        """
+        An instance of :class:`mw.api.DeletedRevisions`.
+        """
+
+    def login(self, username, password, token=None):
+        """
+        Performs a login operation.  This method usually makes two requests to
+        API -- one to get a token and one to use the token to log in.  If
+        authentication fails, this method will throw an
+        :class:`.errors.AuthenticationError`.
+
+        :Parameters:
+            username : str
+                Your username
+            password : str
+                Your password
+
+        :Returns:
+            The response in a json :py:class:`dict`
+        """
+
+        doc = self.post(
+            {
+                'action': "login",
+                'lgname': username,
+                'lgpassword': password,
+                'lgtoken': token, # If None, we'll be getting a token
+            }
+        )
+
+
+        try:
+            if doc['login']['result'] == "Success":
+                return doc
+            elif doc['login']['result'] == "NeedToken":
+
+                if token is not None:
+                    # Woops.  We've been here before.  Better error out.
+                    raise AuthenticationError(doc)
+                else:
+                    token = doc['login']['token']
+                    return self.login(username, password, token=token)
+            else:
+                raise AuthenticationError(doc)
+
+        except KeyError as e:
+            raise MalformedResponse(str(e), doc)
+
+
+    def request(self, type, params, **kwargs):
+        params.update({'format': "json"})
+
+        doc = super().request(type, params, **kwargs).json()
+
+        if 'error' in doc:
+            raise APIError(doc)
+
+        return doc
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/__init__.py
new file mode 100644 (file)
index 0000000..f2ca37a
--- /dev/null
@@ -0,0 +1,4 @@
+# from . import errors
+from .db import DB
+from .collections import Pages, RecentChanges, Revisions, Archives, \
+    AllRevisions, Users
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/__init__.py
new file mode 100644 (file)
index 0000000..ca24195
--- /dev/null
@@ -0,0 +1,4 @@
+from .pages import Pages
+from .recent_changes import RecentChanges
+from .revisions import Revisions, Archives, AllRevisions
+from .users import Users
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/collection.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/collection.py
new file mode 100644 (file)
index 0000000..9c2fd0c
--- /dev/null
@@ -0,0 +1,11 @@
+class Collection:
+    DIRECTIONS = {'newer', 'older'}
+
+    def __init__(self, db):
+        self.db = db
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        return "{0}({1})".format(self.__class__.__name__, repr(self.db))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/pages.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/pages.py
new file mode 100644 (file)
index 0000000..f20c8ac
--- /dev/null
@@ -0,0 +1,65 @@
+import logging
+
+from ...util import none_or
+from .collection import Collection
+
+logger = logging.getLogger("mw.database.collections.pages")
+
+
+class Pages(Collection):
+    def get(self, page_id=None, namespace_title=None, rev_id=None):
+        """
+        Gets a single page based on a legitimate identifier of the page.  Note
+        that namespace_title expects a tuple of namespace ID and title.
+
+        :Parameters:
+            page_id : int
+                Page ID
+            namespace_title : ( int, str )
+                the page's namespace ID and title
+            rev_id : int
+                a revision ID included in the page's history
+
+        :Returns:
+            A page row.  Raises :class:`KeyError` if no matching page is found.
+        """
+
+        page_id = none_or(page_id, int)
+        namespace_title = none_or(namespace_title, tuple)
+        rev_id = none_or(rev_id, int)
+
+        query = """
+        SELECT page.*
+        FROM page
+        """
+        values = []
+
+        if page_id is not None:
+            query += """
+                WHERE page_id = %s
+            """
+            values.append(page_id)
+
+        elif namespace_title is not None:
+            namespace, title = namespace_title
+
+            query += " WHERE page_namespace = %s and page_title = %s "
+            values.extend([int(namespace), str(title)])
+
+        elif rev_id is not None:
+            query += """
+                WHERE page_id = (SELECT rev_page FROM revision WHERE rev_id = %s)
+            """
+            values.append(rev_id)
+
+        else:
+            raise TypeError("Must specify a page identifier.")
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(
+            query,
+            values
+        )
+
+        for row in cursor:
+            return row
+
+        raise KeyError(page_id if page_id is not None
+                       else (namespace_title if namespace_title is not None
+                             else rev_id))
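A quick sketch of looking up a page row (connection parameters are placeholders; `from_params` is defined in `mw/database/db.py` later in this commit):

    from mw import database

    db = database.DB.from_params(host="localhost", user="research",
                                 database="simplewiki")

    # Either identifier form documented above works:
    row = db.pages.get(page_id=12)
    row = db.pages.get(namespace_title=(0, "Main_Page"))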
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/recent_changes.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/recent_changes.py
new file mode 100644 (file)
index 0000000..5f67190
--- /dev/null
@@ -0,0 +1,128 @@
+import logging
+import time
+
+from ...types import Timestamp
+from ...util import none_or
+from .collection import Collection
+
+logger = logging.getLogger("mw.database.collections.recent_changes")
+
+
+class RecentChanges(Collection):
+    # (https://www.mediawiki.org/wiki/Manual:Recentchanges_table)
+    TYPES = {
+        'edit': 0,  # edit of existing page
+        'new': 1,  # new page
+        'move': 2,  # Marked as obsolete
+        'log': 3,  # log action (introduced in MediaWiki 1.2)
+        'move_over_redirect': 4,  # Marked as obsolete
+        'external': 5  # An external recent change. Primarily used by Wikidata
+    }
+
+    def listen(self, last=None, types=None, max_wait=5):
+        """
+        Listens to the recent changes table.  Given no parameters, this function
+        will return an iterator over the entire recentchanges table and then
+        continue to "listen" for new changes, polling every `max_wait` seconds.
+
+        :Parameters:
+            last : dict
+                a recentchanges row to pick up after
+            types : set ( str )
+                a set of recentchanges types to filter for
+            max_wait : float
+                the maximum number of seconds to wait between repeated queries
+
+        :Returns:
+            A never-ending iterator over change rows.
+        """
+        while True:
+            if last is not None:
+                after = last['rc_timestamp']
+                after_id = last['rc_id']
+            else:
+                after = None
+                after_id = None
+
+            start = time.time()
+            rcs = self.query(after=after, after_id=after_id, types=types,
+                             direction="newer")
+
+            for rc in rcs:
+                yield rc
+                last = rc  # remember where we left off for the next poll
+
+            # Guard against a negative sleep when the query took a while
+            time.sleep(max(0, max_wait - (time.time() - start)))
+
+    def query(self, before=None, after=None, before_id=None, after_id=None,
+              types=None, direction=None, limit=None):
+        """
+        Queries the ``recentchanges`` table.  See
+        `<https://www.mediawiki.org/wiki/Manual:Recentchanges_table>`_
+
+        :Parameters:
+            before : :class:`mw.Timestamp`
+                The maximum timestamp
+            after : :class:`mw.Timestamp`
+                The minimum timestamp
+            before_id : int
+                The maximum ``rc_id``
+            after_id : int
+                The minimum ``rc_id``
+            types : set ( str )
+                Which types of changes to return?
+
+                * ``edit`` -- Edits to existing pages
+                * ``new`` -- Edits that create new pages
+                * ``move`` -- (obsolete)
+                * ``log`` -- Log actions (introduced in MediaWiki 1.2)
+                * ``move_over_redirect`` -- (obsolete)
+                * ``external`` -- An external recent change. Primarily used by Wikidata
+
+            direction : str
+                "older" or "newer"
+            limit : int
+                limit the number of records returned
+        """
+        before = none_or(before, Timestamp)
+        after = none_or(after, Timestamp)
+        before_id = none_or(before_id, int)
+        after_id = none_or(after_id, int)
+        types = none_or(types, levels=self.TYPES)
+        direction = none_or(direction, levels=self.DIRECTIONS)
+        limit = none_or(limit, int)
+
+        query = """
+            SELECT * FROM recentchanges
+            WHERE 1
+        """
+        values = []
+
+        if before is not None:
+            query += " AND rc_timestamp < %s "
+            values.append(before.short_format())
+        if after is not None:
+            query += " AND rc_timestamp > %s "
+            values.append(after.short_format())
+        if before_id is not None:
+            query += " AND rc_id < %s "
+            values.append(before_id)
+        if after_id is not None:
+            query += " AND rc_id > %s "
+            values.append(after_id)
+        if types is not None:
+            query += " AND rc_type IN ({0}) ".format(
+                ",".join(str(self.TYPES[t]) for t in types)
+            )
+
+        if direction is not None:
+            direction = ("ASC " if direction == "newer" else "DESC ")
+            query += " ORDER BY rc_timestamp {0}, rc_id {0}".format(direction)
+
+        if limit is not None:
+            query += " LIMIT %s "
+            values.append(limit)
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(query, values)
+        for row in cursor:
+            yield row
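A sketch of tailing the recentchanges table with the collection above (connection parameters are placeholders; note that `DB`, defined later in this commit, does not attach a `RecentChanges` instance itself, so one is constructed directly):

    from mw import database
    from mw.database.collections import RecentChanges

    db = database.DB.from_params(host="localhost", user="research",
                                 database="simplewiki")
    recent_changes = RecentChanges(db)

    # Replay the table oldest-first, then poll for new rows every 5 seconds.
    for rc in recent_changes.listen(max_wait=5):
        print(rc['rc_timestamp'], rc['rc_title'])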
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/revisions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/revisions.py
new file mode 100644 (file)
index 0000000..0ff9a52
--- /dev/null
@@ -0,0 +1,410 @@
+import logging
+import time
+from itertools import chain
+
+from ...types import Timestamp
+from ...util import iteration, none_or
+from .collection import Collection
+
+logger = logging.getLogger("mw.database.collections.revisions")
+
+
+class AllRevisions(Collection):
+    def get(self, rev_id, include_page=False):
+        """
+        Gets a single revision by ID.  Checks both the ``revision`` and
+        ``archive`` tables.  This method throws a :class:`KeyError` if a
+        revision cannot be found.
+
+        :Parameters:
+            rev_id : int
+                Revision ID
+            include_page : bool
+                Join revision returned against ``page``
+
+        :Returns:
+            A revision row
+        """
+        rev_id = int(rev_id)
+        try:
+            rev_row = self.db.revisions.get(rev_id, include_page=include_page)
+        except KeyError:
+            rev_row = self.db.archives.get(rev_id)
+
+        return rev_row
+
+    def query(self, *args, **kwargs):
+        """
+        Queries revisions across the ``revision`` and ``archive`` tables
+        (including revisions of deleted pages)
+
+        :Parameters:
+            page_id : int
+                Page identifier.  Filter revisions to this page.
+            user_id : int
+                User identifier.  Filter revisions to those made by this user.
+            user_text : str
+                User text (user_name or IP address).  Filter revisions to those
+                made by this user.
+            before : :class:`mw.Timestamp`
+                Filter revisions to those made before this timestamp.
+            after : :class:`mw.Timestamp`
+                Filter revisions to those made after this timestamp.
+            before_id : int
+                Filter revisions to those with an ID before this ID
+            after_id : int
+                Filter revisions to those with an ID after this ID
+            direction : str
+                "newer" or "older"
+            limit : int
+                Limit the number of results
+            include_page : bool
+                Join revisions returned against ``page``
+
+        :Returns:
+            An iterator over revision rows.
+        """
+
+        revisions = self.db.revisions.query(*args, **kwargs)
+        archives = self.db.archives.query(*args, **kwargs)
+
+        if 'direction' in kwargs:
+            direction = kwargs['direction']
+            if direction not in self.DIRECTIONS:
+                raise TypeError("direction must be in {0}".format(self.DIRECTIONS))
+
+            if direction == "newer":
+                collated_revisions = iteration.sequence(
+                    revisions,
+                    archives,
+                    compare=lambda r1, r2:\
+                            (r1['rev_timestamp'], r1['rev_id']) <=
+                            (r2['rev_timestamp'], r2['rev_id'])
+                )
+            else:  # direction == "older"
+                collated_revisions = iteration.sequence(
+                    revisions,
+                    archives,
+                    compare=lambda r1, r2:\
+                            (r1['rev_timestamp'], r1['rev_id']) >=
+                            (r2['rev_timestamp'], r2['rev_id'])
+                )
+        else:
+            collated_revisions = chain(revisions, archives)
+
+        if 'limit' in kwargs:
+            limit = kwargs['limit']
+
+            for i, rev in enumerate(collated_revisions):
+                yield rev
+                if i + 1 >= limit:
+                    break
+
+        else:
+            for rev in collated_revisions:
+                yield rev
+
+
+class Revisions(Collection):
+    
+    def get(self, rev_id, include_page=False):
+        """
+        Gets a single revision by ID.  Checks the ``revision`` table.  This
+        method throws a :class:`KeyError` if a revision cannot be found.
+
+        :Parameters:
+            rev_id : int
+                Revision ID
+            include_page : bool
+                Join revision returned against ``page``
+
+        :Returns:
+            A revision row
+        """
+        rev_id = int(rev_id)
+
+        query = """
+            SELECT *, FALSE AS archived FROM revision
+        """
+        if include_page:
+            query += """
+                INNER JOIN page ON page_id = rev_page
+            """
+
+        query += " WHERE rev_id = %s"
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(query, [rev_id])
+
+        for row in cursor:
+            return row
+
+        raise KeyError(rev_id)
+
+    def query(self, page_id=None, user_id=None, user_text=None,
+              before=None, after=None, before_id=None, after_id=None,
+              direction=None, limit=None, include_page=False):
+        """
+        Queries revisions (excludes revisions to deleted pages)
+
+        :Parameters:
+            page_id : int
+                Page identifier.  Filter revisions to this page.
+            user_id : int
+                User identifier.  Filter revisions to those made by this user.
+            user_text : str
+                User text (user_name or IP address).  Filter revisions to those
+                made by this user.
+            before : :class:`mw.Timestamp`
+                Filter revisions to those made before this timestamp.
+            after : :class:`mw.Timestamp`
+                Filter revisions to those made after this timestamp.
+            before_id : int
+                Filter revisions to those with an ID before this ID
+            after_id : int
+                Filter revisions to those with an ID after this ID
+            direction : str
+                "newer" or "older"
+            limit : int
+                Limit the number of results
+            include_page : bool
+                Join revisions returned against ``page``
+
+        :Returns:
+            An iterator over revision rows.
+        """
+        start_time = time.time()
+
+        page_id = none_or(page_id, int)
+        user_id = none_or(user_id, int)
+        user_text = none_or(user_text, str)
+        before = none_or(before, Timestamp)
+        after = none_or(after, Timestamp)
+        before_id = none_or(before_id, int)
+        after_id = none_or(after_id, int)
+        direction = none_or(direction, levels=self.DIRECTIONS)
+        include_page = bool(include_page)
+
+        query = """
+            SELECT *, FALSE AS archived FROM revision
+        """
+
+        if include_page:
+            query += """
+                INNER JOIN page ON page_id = rev_page
+            """
+
+        query += """
+            WHERE 1
+        """
+        values = []
+
+        if page_id is not None:
+            query += " AND rev_page = %s "
+            values.append(page_id)
+        if user_id is not None:
+            query += " AND rev_user = %s "
+            values.append(user_id)
+        if user_text is not None:
+            query += " AND rev_user_text = %s "
+            values.append(user_text)
+        if before is not None:
+            query += " AND rev_timestamp < %s "
+            values.append(before.short_format())
+        if after is not None:
+            query += " AND rev_timestamp > %s "
+            values.append(after.short_format())
+        if before_id is not None:
+            query += " AND rev_id < %s "
+            values.append(before_id)
+        if after_id is not None:
+            query += " AND rev_id > %s "
+            values.append(after_id)
+
+        if direction is not None:
+            direction = ("ASC " if direction == "newer" else "DESC ")
+
+            if before_id is not None or after_id is not None:
+                query += " ORDER BY rev_id {0}, rev_timestamp {0}".format(direction)
+            else:
+                query += " ORDER BY rev_timestamp {0}, rev_id {0}".format(direction)
+
+        if limit is not None:
+            query += " LIMIT %s "
+            values.append(limit)
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(query, values)
+        count = 0
+        for row in cursor:
+            yield row
+            count += 1
+
+        logger.debug("%s revisions read in %s seconds" % (count, time.time() - start_time))
+
+
+class Archives(Collection):
+    def get(self, rev_id):
+        """
+        Gets a single revision by ID.  Checks the ``archive`` table.  This
+        method throws a :class:`KeyError` if a revision cannot be found.
+
+        :Parameters:
+            rev_id : int
+                Revision ID
+
+        :Returns:
+            A revision row
+        """
+        rev_id = int(rev_id)
+
+        query = """
+            SELECT
+                ar_id,
+                ar_rev_id AS rev_id,
+                ar_page_id AS rev_page,
+                ar_page_id AS page_id,
+                ar_title AS page_title,
+                ar_namespace AS page_namespace,
+                ar_text_id AS rev_text_id,
+                ar_comment AS rev_comment,
+                ar_user AS rev_user,
+                ar_user_text AS rev_user_text,
+                ar_timestamp AS rev_timestamp,
+                ar_minor_edit AS rev_minor_edit,
+                ar_deleted AS rev_deleted,
+                ar_len AS rev_len,
+                ar_parent_id AS rev_parent_id,
+                ar_sha1 AS rev_sha1,
+                TRUE AS archived
+            FROM archive
+            WHERE ar_rev_id = %s
+        """
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(query, [rev_id])
+        for row in cursor:
+            return row
+
+        raise KeyError(rev_id)
+
+    def query(self, page_id=None, user_id=None, user_text=None,
+              before=None, after=None, before_id=None, after_id=None,
+              before_ar_id=None, after_ar_id=None,
+              direction=None, limit=None, include_page=True):
+        """
+        Queries archived revisions (revisions of deleted pages)
+
+        :Parameters:
+            page_id : int
+                Page identifier.  Filter revisions to this page.
+            user_id : int
+                User identifier.  Filter revisions to those made by this user.
+            user_text : str
+                User text (user_name or IP address).  Filter revisions to those
+                made by this user.
+            before : :class:`mw.Timestamp`
+                Filter revisions to those made before this timestamp.
+            after : :class:`mw.Timestamp`
+                Filter revisions to those made after this timestamp.
+            before_id : int
+                Filter revisions to those with an ID before this ID
+            after_id : int
+                Filter revisions to those with an ID after this ID
+            direction : str
+                "newer" or "older"
+            limit : int
+                Limit the number of results
+            include_page : bool
+                This field is ignored.  It's only here for compatibility with
+                :class:`mw.database.Revisions`.
+
+        :Returns:
+            An iterator over revision rows.
+        """
+        page_id = none_or(page_id, int)
+        user_id = none_or(user_id, int)
+        user_text = none_or(user_text, str)
+        before = none_or(before, Timestamp)
+        after = none_or(after, Timestamp)
+        before_id = none_or(before_id, int)
+        after_id = none_or(after_id, int)
+        direction = none_or(direction, levels=self.DIRECTIONS)
+        limit = none_or(limit, int)
+
+        start_time = time.time()
+        cursor = self.db.shared_connection.cursor()
+
+        query = """
+            SELECT
+                ar_id,
+                ar_rev_id AS rev_id,
+                ar_page_id AS rev_page,
+                ar_page_id AS page_id,
+                ar_title AS page_title,
+                ar_namespace AS page_namespace,
+                ar_text_id AS rev_text_id,
+                ar_comment AS rev_comment,
+                ar_user AS rev_user,
+                ar_user_text AS rev_user_text,
+                ar_timestamp AS rev_timestamp,
+                ar_minor_edit AS rev_minor_edit,
+                ar_deleted AS rev_deleted,
+                ar_len AS rev_len,
+                ar_parent_id AS rev_parent_id,
+                ar_sha1 AS rev_sha1,
+                TRUE AS archived
+            FROM archive
+        """
+
+        query += """
+            WHERE 1
+        """
+        values = []
+
+        if page_id is not None:
+            query += " AND ar_page_id = %s "
+            values.append(page_id)
+        if user_id is not None:
+            query += " AND ar_user = %s "
+            values.append(user_id)
+        if user_text is not None:
+            query += " AND ar_user_text = %s "
+            values.append(user_text)
+        if before is not None:
+            query += " AND ar_timestamp < %s "
+            values.append(before.short_format())
+        if after is not None:
+            query += " AND ar_timestamp > %s "
+            values.append(after.short_format())
+        if before_id is not None:
+            query += " AND ar_rev_id < %s "
+            values.append(before_id)
+        if after_id is not None:
+            query += " AND ar_rev_id > %s "
+            values.append(after_id)
+        if before_ar_id is not None:
+            query += " AND ar_id < %s "
+            values.append(before_ar_id)
+        if after_ar_id is not None:
+            query += " AND ar_id > %s "
+            values.append(after_ar_id)
+
+        if direction is not None:
+            direction = ("ASC " if direction == "newer" else "DESC ")
+
+            if before is not None or after is not None:
+                query += " ORDER BY ar_timestamp {0}, ar_rev_id {0}".format(direction)
+            elif before_id is not None or after_id is not None:
+                query += " ORDER BY ar_rev_id {0}, ar_timestamp {0}".format(direction)
+            else:
+                query += " ORDER BY ar_id {0}".format(direction)
+
+        if limit is not None:
+            query += " LIMIT %s "
+            values.append(limit)
+
+        cursor.execute(query, values)
+        count = 0
+        for row in cursor:
+            yield row
+            count += 1
+
+        logger.debug("%s revisions read in %s seconds" % (count, time.time() - start_time))
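A sketch of the revision collections above (connection parameters, user name, and timestamp are placeholders):

    from mw import database
    from mw.types import Timestamp

    db = database.DB.from_params(host="localhost", user="research",
                                 database="simplewiki")

    # Live revisions by one editor since 2017, oldest first.
    revs = db.revisions.query(user_text="ExampleUser",
                              after=Timestamp("20170101000000"),
                              direction="newer", limit=100)

    # The same query collated across the revision and archive tables.
    all_revs = db.all_revisions.query(user_text="ExampleUser",
                                      direction="newer", limit=100)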
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/users.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/collections/users.py
new file mode 100644 (file)
index 0000000..7065fa4
--- /dev/null
@@ -0,0 +1,154 @@
+import logging
+import time
+
+from ...types import Timestamp
+from ...util import none_or
+from .collection import Collection
+
+logger = logging.getLogger("mw.database.collections.users")
+
+
+class Users(Collection):
+    CREATION_ACTIONS = {'newusers', 'create', 'create2', 'autocreate',
+                        'byemail'}
+
+    def get(self, user_id=None, user_name=None):
+        """
+        Gets a single user row from the database.  Raises a :class:`KeyError`
+        if a user cannot be found.
+
+        :Parameters:
+            user_id : int
+                User ID
+            user_name : str
+                User's name
+
+        :Returns:
+            A user row.
+        """
+        user_id = none_or(user_id, int)
+        user_name = none_or(user_name, str)
+
+        query = """
+            SELECT user.*
+            FROM user
+        """
+        values = []
+
+        if user_id is not None:
+            query += """
+                WHERE user_id = %s
+            """
+            values.append(user_id)
+
+        elif user_name is not None:
+            query += """
+                WHERE user_name = %s
+            """
+            values.append(user_name)
+
+        else:
+            raise TypeError("Must specify a user identifier.")
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(
+            query,
+            values
+        )
+
+        for row in cursor:
+            return row
+
+        raise KeyError(user_id if user_id is not None else user_name)
+
+    def query(self, registered_before=None, registered_after=None,
+              before_id=None, after_id=None, limit=None,
+              direction=None, self_created_only=False):
+        """
+        Queries users based on various filtering parameters.
+
+        :Parameters:
+            registered_before : :class:`mw.Timestamp`
+                A timestamp to search before (inclusive)
+            registered_after : :class:`mw.Timestamp`
+                A timestamp to search after (inclusive)
+            before_id : int
+                A user_id to search before (inclusive)
+            after_id : int
+                A user_id to search after (inclusive)
+            direction : str
+                "newer" or "older"
+            limit : int
+                Limit the results to at most this number
+            self_created_only : bool
+                limit results to self-created user accounts
+
+        :Returns:
+            an iterator over ``user`` table rows
+        """
+        start_time = time.time()
+
+        registered_before = none_or(registered_before, Timestamp)
+        registered_after = none_or(registered_after, Timestamp)
+        before_id = none_or(before_id, int)
+        after_id = none_or(after_id, int)
+        direction = none_or(direction, levels=self.DIRECTIONS)
+        limit = none_or(limit, int)
+        self_created_only = bool(self_created_only)
+
+        query = """
+            SELECT user.*
+            FROM user
+        """
+        values = []
+
+        if self_created_only:
+            query += """
+                INNER JOIN logging ON
+                    log_user = user_id AND
+                    log_type = "newusers" AND
+                    log_action = "create"
+            """
+
+        query += "WHERE 1 "
+
+        if registered_before is not None:
+            query += "AND user_registration <= %s "
+            values.append(registered_before.short_format())
+        if registered_after is not None:
+            query += "AND user_registration >= %s "
+            values.append(registered_after.short_format())
+        if before_id is not None:
+            query += "AND user_id <= %s "
+            values.append(before_id)
+        if after_id is not None:
+            query += "AND user_id >= %s "
+            values.append(after_id)
+
+        query += "GROUP BY user_id "  # In case of duplicate log events
+        
+        if direction is not None:
+            if registered_before is not None or registered_after is not None:
+                if direction == "newer":
+                    query += "ORDER BY user_registration ASC "
+                else:
+                    query += "ORDER BY user_registration DESC "
+            else:
+                if direction == "newer":
+                    query += "ORDER BY user_id ASC "
+                else:
+                    query += "ORDER BY user_id DESC "
+
+        if limit is not None:
+            query += "LIMIT %s "
+            values.append(limit)
+
+        cursor = self.db.shared_connection.cursor()
+        cursor.execute(query, values)
+
+        count = 0
+        for row in cursor:
+            yield row
+            count += 1
+
+        logger.debug("%s users queried in %s seconds" % (count, time.time() - start_time))
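A sketch of querying the user table (connection parameters and timestamp are placeholders, as above):

    from mw import database
    from mw.types import Timestamp

    db = database.DB.from_params(host="localhost", user="research",
                                 database="simplewiki")

    # Self-created accounts registered since 2018, oldest registration first.
    for row in db.users.query(registered_after=Timestamp("20180101000000"),
                              self_created_only=True,
                              direction="newer", limit=50):
        print(row['user_id'], row['user_name'])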
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/db.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/database/db.py
new file mode 100644 (file)
index 0000000..acca28f
--- /dev/null
@@ -0,0 +1,134 @@
+import getpass
+import logging
+import os
+
+import pymysql
+import pymysql.cursors
+
+from .collections import AllRevisions, Archives, Pages, Revisions, Users
+
+logger = logging.getLogger("mw.database.db")
+
+
+class DB:
+    """
+    Represents a connection to a MySQL database.
+
+    :Parameters:
+        connection : :class:`pymysql.connections.Connection`
+            A connection to a MediaWiki database
+    """
+
+    def __init__(self, connection):
+        self.shared_connection = connection
+        self.shared_connection.cursorclass = pymysql.cursors.DictCursor
+
+        self.revisions = Revisions(self)
+        """
+        An instance of :class:`mw.database.Revisions`.
+        """
+
+        self.archives = Archives(self)
+        """
+        An instance of :class:`mw.database.Archives`.
+        """
+
+        self.all_revisions = AllRevisions(self)
+        """
+        An instance of :class:`mw.database.AllRevisions`.
+        """
+
+        self.pages = Pages(self)
+        """
+        An instance of :class:`mw.database.Pages`.
+        """
+
+        self.users = Users(self)
+        """
+        An instance of :class:`mw.database.Users`.
+        """
+
+    def __repr__(self):
+        return "%s(%s)" % (
+            self.__class__.__name__,
+            repr(self.shared_connection)
+        )
+
+    def __str__(self):
+        return self.__repr__()
+
+    @classmethod
+    def add_arguments(cls, parser, defaults=None):
+        """
+        Adds the arguments to an :class:`argparse.ArgumentParser` in order to
+        create a database connection.
+        """
+        defaults = defaults if defaults is not None else {}
+
+        default_host = defaults.get('host', "localhost")
+        parser.add_argument(
+            '--host', '-h',
+            help="MySQL database host to connect to (defaults to {0})".format(default_host),
+            default=default_host
+        )
+
+        default_database = defaults.get('database', getpass.getuser())
+        parser.add_argument(
+            '--database', '-d',
+            help="MySQL database name to connect to (defaults to  {0})".format(default_database),
+            default=default_database
+        )
+
+        default_defaults_file = defaults.get('defaults-file', os.path.expanduser("~/.my.cnf"))
+        parser.add_argument(
+            '--defaults-file',
+            help="MySQL defaults file (defaults to {0})".format(default_defaults_file),
+            default=default_defaults_file
+        )
+
+        default_user = defaults.get('user', getpass.getuser())
+        parser.add_argument(
+            '--user', '-u',
+            help="MySQL user (defaults to %s)".format(default_user),
+            default=default_user
+        )
+        return parser
+
+    @classmethod
+    def from_arguments(cls, args):
+        """
+        Constructs a :class:`~mw.database.DB`.
+        Consumes :class:`argparse.ArgumentParser` arguments given by
+        :meth:`add_arguments` in order to create a :class:`DB`.
+
+        :Parameters:
+            args : :class:`argparse.Namespace`
+                A collection of argument values returned by :class:`argparse.ArgumentParser`'s :meth:`parse_args()`
+        """
+        connection = pymysql.connect(
+            args.host,
+            args.user,
+            database=args.database,
+            read_default_file=args.defaults_file
+        )
+        return cls(connection)
+
+    @classmethod
+    def from_params(cls, *args, **kwargs):
+        """
+        Constructs a :class:`~mw.database.DB`.  Passes `*args` and `**kwargs`
+        to :func:`pymysql.connect` and configures the connection.
+
+        :Parameters:
+            args, kwargs
+                Positional and keyword arguments passed through to
+                :func:`pymysql.connect`
+        """
+        kwargs['cursorclass'] = pymysql.cursors.DictCursor
+        if 'db' in kwargs:
+            kwargs['database'] = kwargs['db']
+            del kwargs['db']
+        connection = pymysql.connect(*args, **kwargs)
+        return cls(connection)
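A sketch of the argparse plumbing above (argument values are placeholders; `add_help=False` sidesteps the clash between the `-h` host flag and argparse's built-in help flag):

    import argparse

    from mw import database

    parser = argparse.ArgumentParser(add_help=False)
    database.DB.add_arguments(parser, defaults={'database': "simplewiki"})

    args = parser.parse_args(["--host", "localhost", "--user", "research"])
    db = database.DB.from_arguments(args)
    print(db.revisions, db.pages, db.users)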
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/__init__.py
new file mode 100644 (file)
index 0000000..38ee519
--- /dev/null
@@ -0,0 +1,14 @@
+"""
+A package with utilities for managing the persistent word analysis across text
+versions of a document.  `PersistenceState` is the highest level of the
+interface and the part of the system that's most interesting externally.  `Word`s
+are also very important.  The current implementation of `Word` only accounts for
+how the number of revisions in which a Word is visible.  If persistent word
+views (or something similar) is intended to be kept, refactoring will be
+necessary.
+"""
+
+from .state import State
+from .tokens import Tokens, Token
+from . import defaults
+from . import api
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/api.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/api.py
new file mode 100644 (file)
index 0000000..c2c122e
--- /dev/null
@@ -0,0 +1,85 @@
+from .. import reverts
+from ...util import none_or
+from .state import State
+
+
+def track(session, rev_id, page_id=None, revert_radius=reverts.defaults.RADIUS,
+          future_revisions=reverts.defaults.RADIUS, properties=None):
+    """
+    Computes a persistence score for a revision by processing the revisions
+    that took place around it.
+
+    :Parameters:
+        session : :class:`mw.api.Session`
+            An API session to make use of
+        rev_id : int
+            the ID of the revision to check
+        page_id : int
+            the ID of the page the revision occupies (slower if not provided)
+        revert_radius : int
+            a positive integer indicating the maximum number of revisions that can be reverted
+        future_revisions : int
+            the maximum number of future revisions to process
+        properties : set( str )
+            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
+
+    :Returns:
+        A tuple of (current revision, tokens added by it, list of future
+        revisions processed), or None if there isn't enough history.
+    """
+
+    if not hasattr(session, "revisions"):
+        raise TypeError("session is wrong type.  Expected a mw.api.Session.")
+
+    rev_id = int(rev_id)
+    page_id = none_or(page_id, int)
+    revert_radius = int(revert_radius)
+    if revert_radius < 1:
+        raise TypeError("invalid radius.  Expected a positive integer.")
+    properties = set(properties) if properties is not None else set()
+
+
+    # If we don't have the page_id, we're going to need to look it up
+    if page_id is None:
+        rev = session.revisions.get(rev_id, properties={'ids'})
+        page_id = rev['page']['pageid']
+
+    # Load history and current rev
+    current_and_past_revs = list(session.revisions.query(
+        pageids={page_id},
+        limit=revert_radius + 1,
+        start_id=rev_id,
+        direction="older",
+        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
+    ))
+
+    try:
+        # Extract current rev and reorder history
+        current_rev, past_revs = (
+            current_and_past_revs[0],  # Current rev is the first one returned
+            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
+        )
+    except IndexError:
+        # Only way to get here is if there isn't enough history.  Couldn't be
+        # reverted.  Just return None.
+        return None
+
+    # Load future revisions
+    future_revs = session.revisions.query(
+        pageids={page_id},
+        limit=future_revisions,
+        start_id=rev_id + 1, # Ensures that we skip the current revision
+        direction="newer",
+        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
+    )
+
+    state = State(revert_radius=revert_radius)
+
+    # Process old revisions
+    for rev in past_revs:
+        state.process(rev.get('*', ""), rev, rev.get('sha1'))
+
+    # Process current revision
+    _, tokens_added, _ = state.process(current_rev.get('*', ""), current_rev,
+                                       current_rev.get('sha1'))
+
+    # Process new revisions
+    future_revs = list(future_revs)
+    for rev in future_revs:
+        state.process(rev.get('*', ""), rev, rev.get('sha1'))
+
+    return current_rev, tokens_added, future_revs
+
+score = track
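A sketch of scoring one revision's persistence over the API (the endpoint and revision ID are placeholders):

    from mw import api
    from mw.lib import persistence

    session = api.Session("https://en.wikipedia.org/w/api.php",
                          user_agent="demo-tool/0.1 (demo@example.org)")

    result = persistence.api.track(session, 771942902)
    if result is not None:
        current_rev, tokens_added, future_revs = result
        # Tokens that survived many of the following revisions persist well.
        for token in tokens_added:
            print(repr(token.text), len(token.revisions))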
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/defaults.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/defaults.py
new file mode 100644 (file)
index 0000000..1369792
--- /dev/null
@@ -0,0 +1,11 @@
+from . import tokenization, difference
+
+TOKENIZE = tokenization.wikitext_split
+"""
+The standard tokenizing function.
+"""
+
+DIFF = difference.sequence_matcher
+"""
+The standard diff function
+"""
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/difference.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/difference.py
new file mode 100644 (file)
index 0000000..34a671e
--- /dev/null
@@ -0,0 +1,49 @@
+from difflib import SequenceMatcher
+
+
+def sequence_matcher(old, new):
+    """
+    Generates a sequence of operations using :class:`difflib.SequenceMatcher`.
+
+    :Parameters:
+        old : list( `hashable` )
+            Old tokens
+        new : list( `hashable` )
+            New tokens
+
+    :Returns:
+        Minimal operations needed to convert `old` to `new`
+    """
+    sm = SequenceMatcher(None, list(old), list(new))
+    return sm.get_opcodes()
+
+
+def apply(ops, old, new):
+    """
+    Applies operations (delta) to copy items from `old` to `new`.
+
+    :Parameters:
+        ops : list((op, a1, a2, b1, b2))
+            Operations to perform
+        old : list( `hashable` )
+            Old tokens
+        new : list( `hashable` )
+            New tokens
+    :Returns:
+        An iterator over elements matching `new` but copied from `old`
+    """
+    for code, a_start, a_end, b_start, b_end in ops:
+        if code == "insert":
+            for t in new[b_start:b_end]:
+                yield t
+        elif code == "replace":
+            for t in new[b_start:b_end]:
+                yield t
+        elif code == "equal":
+            for t in old[a_start:a_end]:
+                yield t
+        elif code == "delete":
+            pass
+        else:
+            assert False, \
+                "encounted an unrecognized operation code: " + repr(code)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/state.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/state.py
new file mode 100644 (file)
index 0000000..9aade7c
--- /dev/null
@@ -0,0 +1,149 @@
+from hashlib import sha1
+
+from . import defaults
+from .. import reverts
+from .tokens import Token, Tokens
+
+
+class Version:
+    __slots__ = ('tokens',)
+
+    def __init__(self):
+        self.tokens = None
+
+
+class State:
+    """
+    Represents the state of word persistence in a page.
+    See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_
+
+    :Parameters:
+        tokenize : function( `str` ) --> list( `str` )
+            A tokenizing function
+        diff : function(list( `str` ), list( `str` )) --> list( `ops` )
+            A function to perform a difference between token lists
+        revert_radius : int
+            a positive integer indicating the maximum revision distance that a revert can span.
+        revert_detector : :class:`mw.lib.reverts.Detector`
+            a revert detector to start process with
+    :Example:
+        >>> from pprint import pprint
+        >>> from mw.lib import persistence
+        >>>
+        >>> state = persistence.State()
+        >>>
+        >>> pprint(state.process("Apples are red.", revision=1))
+        ([Token(text='Apples', revisions=[1]),
+          Token(text=' ', revisions=[1]),
+          Token(text='are', revisions=[1]),
+          Token(text=' ', revisions=[1]),
+          Token(text='red', revisions=[1]),
+          Token(text='.', revisions=[1])],
+         [Token(text='Apples', revisions=[1]),
+          Token(text=' ', revisions=[1]),
+          Token(text='are', revisions=[1]),
+          Token(text=' ', revisions=[1]),
+          Token(text='red', revisions=[1]),
+          Token(text='.', revisions=[1])],
+         [])
+        >>> pprint(state.process("Apples are blue.", revision=2))
+        ([Token(text='Apples', revisions=[1, 2]),
+          Token(text=' ', revisions=[1, 2]),
+          Token(text='are', revisions=[1, 2]),
+          Token(text=' ', revisions=[1, 2]),
+          Token(text='blue', revisions=[2]),
+          Token(text='.', revisions=[1, 2])],
+         [Token(text='blue', revisions=[2])],
+         [Token(text='red', revisions=[1])])
+        >>> pprint(state.process("Apples are red.", revision=3)) # A revert!
+        ([Token(text='Apples', revisions=[1, 2, 3]),
+          Token(text=' ', revisions=[1, 2, 3]),
+          Token(text='are', revisions=[1, 2, 3]),
+          Token(text=' ', revisions=[1, 2, 3]),
+          Token(text='red', revisions=[1, 3]),
+          Token(text='.', revisions=[1, 2, 3])],
+         [],
+         [])
+    """
+
+    def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
+                 revert_radius=reverts.defaults.RADIUS,
+                 revert_detector=None):
+        self.tokenize = tokenize
+        self.diff = diff
+
+        # Either pass a detector or the revert radius so I can make one
+        if revert_detector is None:
+            self.revert_detector = reverts.Detector(int(revert_radius))
+        else:
+            self.revert_detector = revert_detector
+
+        # Stores the last tokens
+        self.last = None
+
+    def process(self, text, revision=None, checksum=None):
+        """
+        Modifies the internal state based on a change to the content and
+        returns the sets of words added and removed.
+
+        :Parameters:
+            text : str
+                The text content of a revision
+            revision : `mixed`
+                Revision meta data
+            checksum : str
+                A checksum hash of the text content (will be generated if not provided)
+
+        :Returns:
+            Three :class:`~mw.lib.persistence.Tokens` lists
+
+            current_tokens : :class:`~mw.lib.persistence.Tokens`
+                A sequence of :class:`~mw.lib.persistence.Token` for the
+                processed revision
+            tokens_added : :class:`~mw.lib.persistence.Tokens`
+                A set of tokens that were inserted by the processed revision
+            tokens_removed : :class:`~mw.lib.persistence.Tokens`
+                A sequence of :class:`~mw.lib.persistence.Token` removed by the
+                processed revision
+
+        """
+        if checksum is None:
+            checksum = sha1(bytes(text, 'utf8')).hexdigest()
+
+        version = Version()
+
+        revert = self.revert_detector.process(checksum, version)
+        if revert is not None:  # Revert
+
+            # Empty words.
+            tokens_added = Tokens()
+            tokens_removed = Tokens()
+
+            # Extract reverted_to revision
+            _, _, reverted_to = revert
+            version.tokens = reverted_to.tokens
+
+        else:
+
+            if self.last is None:  # First version of the page!
+
+                version.tokens = Tokens(Token(t) for t in self.tokenize(text))
+                tokens_added = version.tokens
+                tokens_removed = Tokens()
+
+            else:
+
+                # NOTICE: HEAVY COMPUTATION HERE!!!
+                #
+                # OK.  It's not that heavy.  It's just performing a diff,
+                # but you're still going to spend most of your time here.
+                # Diffs usually run in O(n^2) -- O(n^3) time and most tokenizers
+                # produce a lot of tokens.
+                version.tokens, tokens_added, tokens_removed = \
+                    self.last.tokens.compare(self.tokenize(text), self.diff)
+
+        version.tokens.persist(revision)
+
+        self.last = version
+
+        return version.tokens, tokens_added, tokens_removed
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_difference.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_difference.py
new file mode 100644 (file)
index 0000000..89429dc
--- /dev/null
@@ -0,0 +1,12 @@
+from nose.tools import eq_
+
+from .. import difference
+
+
+def test_sequence_matcher():
+    t1 = "foobar derp hepl derpl"
+    t2 = "fooasldal 3 hepl asl a derpl"
+
+    ops = difference.sequence_matcher(t1, t2)
+
+    eq_("".join(difference.apply(ops, t1, t2)), t2)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_state.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_state.py
new file mode 100644 (file)
index 0000000..41c77c9
--- /dev/null
@@ -0,0 +1,25 @@
+from nose.tools import eq_
+
+from ..state import State
+
+
+def test_state():
+    contents_revisions = [
+        ("Apples are red.", 0),
+        ("Apples are blue.", 1),
+        ("Apples are red.", 2),
+        ("Apples are tasty and red.", 3),
+        ("Apples are tasty and blue.", 4)
+    ]
+
+    state = State()
+
+    token_sets = [state.process(c, r) for c, r in contents_revisions]
+
+    for i, (content, revision) in enumerate(contents_revisions):
+        eq_("".join(token_sets[i][0].texts()), content)
+
+    eq_(token_sets[0][0][0].text, "Apples")
+    eq_(len(token_sets[0][0][0].revisions), 5)
+    eq_(token_sets[0][0][4].text, "red")
+    eq_(len(token_sets[0][0][4].revisions), 3)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokenization.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokenization.py
new file mode 100644 (file)
index 0000000..45bc6c3
--- /dev/null
@@ -0,0 +1,10 @@
+from nose.tools import eq_
+
+from .. import tokenization
+
+
+def test_wikitext_split():
+    eq_(
+        list(tokenization.wikitext_split("foo bar herp {{derp}}")),
+        ["foo", " ", "bar", " ", "herp", " ", "{{", "derp", "}}"]
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokens.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tests/test_tokens.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokenization.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokenization.py
new file mode 100644 (file)
index 0000000..d759754
--- /dev/null
@@ -0,0 +1,16 @@
+import re
+
+
+def wikitext_split(text):
+    """
+    Performs the simplest possible split of latin character-based languages
+    and wikitext.
+
+    :Parameters:
+        text : str
+            Text to split.
+    """
+    return re.findall(
+        r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|.",
+        text
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokens.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/persistence/tokens.py
new file mode 100644 (file)
index 0000000..48287ce
--- /dev/null
@@ -0,0 +1,98 @@
+class Token:
+    """
+    Represents a chunk of text and the revisions of a page that it survived.
+    """
+    __slots__ = ('text', 'revisions')
+
+    def __init__(self, text, revisions=None):
+        self.text = text
+        """
+        The text of the token.
+        """
+
+        self.revisions = revisions if revisions is not None else []
+        """
+        The meta data for the revisions that the token has appeared within.
+        """
+
+    def persist(self, revision):
+        self.revisions.append(revision)
+
+    def __repr__(self):
+        return "{0}({1})".format(
+            self.__class__.__name__,
+            ", ".join([
+                "text={0}".format(repr(self.text)),
+                "revisions={0}".format(repr(self.revisions))
+            ])
+        )
+
+
+class Tokens(list):
+    """
+    Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with some
+    useful helper functions.
+
+    :Example:
+
+        >>> from mw.lib.persistence import Token, Tokens
+        >>>
+        >>> tokens = Tokens()
+        >>> tokens.append(Token("foo"))
+        >>> tokens.extend([Token(" "), Token("bar")])
+        >>>
+        >>> tokens[0]
+        Token(text='foo', revisions=[])
+        >>>
+        >>> "".join(tokens.texts())
+        'foo bar'
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def persist(self, revision):
+        for token in self:
+            token.persist(revision)
+
+    def texts(self):
+        for token in self:
+            yield token.text
+
+    def compare(self, new, diff):
+        old = self.texts()
+
+        return self.apply_diff(diff(old, new), self, new)
+
+    @classmethod
+    def apply_diff(cls, ops, old, new):
+
+        tokens = cls()
+        tokens_added = cls()
+        tokens_removed = cls()
+
+        for code, a_start, a_end, b_start, b_end in ops:
+            if code == "insert":
+                for token_text in new[b_start:b_end]:
+                    token = Token(token_text)
+                    tokens.append(token)
+                    tokens_added.append(token)
+
+            elif code == "replace":
+                for token_text in new[b_start:b_end]:
+                    token = Token(token_text)
+                    tokens.append(token)
+                    tokens_added.append(token)
+
+                tokens_removed.extend(old[a_start:a_end])
+
+            elif code == "equal":
+                tokens.extend(old[a_start:a_end])
+            elif code == "delete":
+                tokens_removed.extend(old[a_start:a_end])
+
+            else:
+                assert False, \
+                    "encounted an unrecognized operation code: " + repr(code)
+
+        return (tokens, tokens_added, tokens_removed)
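A sketch tying `Tokens.compare` to the diff function from `difference.py` (the content strings are made up):

    from mw.lib.persistence import Token, Tokens, difference

    tokens = Tokens(Token(t) for t in ["apples", " ", "are", " ", "red"])
    tokens.persist(1)  # every token survives revision 1

    new_texts = ["apples", " ", "are", " ", "blue"]
    tokens, added, removed = tokens.compare(new_texts,
                                            difference.sequence_matcher)
    tokens.persist(2)

    print([t.text for t in added])    # ['blue']
    print([t.text for t in removed])  # ['red']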
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/__init__.py
new file mode 100644 (file)
index 0000000..742b90b
--- /dev/null
@@ -0,0 +1,24 @@
+"""
+This module provides a set of utilities for detecting identity reverts in
+revisioned content.
+
+To detect reverts in a stream of revisions to a single page, you can use
+:func:`detect`.  If you'll be detecting reverts in a collection of pages or
+would, for some other reason, prefer to process revisions one at a time,
+:class:`Detector` and its :meth:`~Detector.process` will allow you to do so.
+
+To detect reverts one at a time and arbitrarily, you can use the `check()`
+functions:
+
+* :func:`database.check` and :func:`database.check_row` use a :class:`mw.database.DB`
+* :func:`api.check` and :func:`api.check_rev` use a :class:`mw.api.Session`
+
+Note that these functions are less performant than detecting reverts in a
+stream of page revisions, but they can be practical when trying to identify
+reverted revisions in a user's contribution history.
+"""
+from .detector import Detector, Revert
+from .functions import detect, reverts
+from . import database
+from . import api
+from . import defaults
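A sketch of stream-based detection with `detect()` (the checksums and metadata are made up; any value may be paired with a checksum and is handed back inside the `Revert` tuple):

    from mw.lib import reverts

    checksum_revisions = [
        ("aaa", {'rev_id': 1}),
        ("bbb", {'rev_id': 2}),
        ("aaa", {'rev_id': 3}),  # identity revert back to rev 1
    ]

    for revert in reverts.detect(checksum_revisions):
        print(revert.reverting, revert.reverteds, revert.reverted_to)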
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/api.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/api.py
new file mode 100644 (file)
index 0000000..6fbcc7e
--- /dev/null
@@ -0,0 +1,134 @@
+from itertools import chain
+
+from . import defaults
+from ...types import Timestamp
+from ...util import none_or
+from .dummy_checksum import DummyChecksum
+from .functions import detect
+
+
+def check_rev(session, rev, **kwargs):
+    """
+    Checks whether a revision (database row) was reverted (identity) and returns
+    a named tuple of Revert(reverting, reverteds, reverted_to).
+
+    :Parameters:
+        session : :class:`mw.api.Session`
+            An API session to make use of
+        rev : dict
+            a revision dict containing 'revid' and 'page.id'
+        radius : int
+            a positive integer indicating the maximum number of revisions that can be reverted
+        before : :class:`mw.Timestamp`
+            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
+        properties : set( str )
+            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
+    """
+
+    # extract rev_id and page_id
+    if 'revid' in rev:
+        rev_id = rev['revid']
+    else:
+        raise TypeError("rev must have 'rev_id'")
+    if 'page' in rev:
+        page_id = rev['page']['id']
+    elif 'pageid' in rev:
+        page_id = rev['pageid']
+    else:
+        raise TypeError("rev must have 'page' or 'pageid'")
+
+    # run the regular check
+    return check(session, rev_id, page_id=page_id, **kwargs)
+
+
+def check(session, rev_id, page_id=None, radius=defaults.RADIUS,
+          before=None, window=None, properties=None):
+    """
+    Checks whether a revision was reverted (identity) and returns a named tuple
+    of Revert(reverting, reverteds, reverted_to).
+
+    :Parameters:
+        session : :class:`mw.api.Session`
+            An API session to make use of
+        rev_id : int
+            the ID of the revision to check
+        page_id : int
+            the ID of the page the revision occupies (slower if not provided)
+        radius : int
+            a positive integer indicating the maximum number of revisions
+            that can be reverted
+        before : :class:`mw.Timestamp`
+            if set, limits the search for *reverting* revisions to those which
+            were saved before this timestamp
+        window : int
+            if set, limits the search for *reverting* revisions to those which
+            were saved within `window` seconds after the reverted edit
+        properties : set( str )
+            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
+    """
+
+    if not hasattr(session, "revisions"):
+        raise TypeError("session wrong type.  Expected a mw.api.Session.")
+
+    rev_id = int(rev_id)
+    radius = int(radius)
+    if radius < 1:
+        raise TypeError("invalid radius.  Expected a positive integer.")
+
+    page_id = none_or(page_id, int)
+    before = none_or(before, Timestamp)
+    properties = set(properties) if properties is not None else set()
+
+    # If we don't have the page_id, we're going to need to look them up
+    if page_id is None:
+        rev = session.revisions.get(rev_id, properties={'ids'})
+        page_id = rev['page']['pageid']
+
+    # Load history and current rev
+    current_and_past_revs = list(session.revisions.query(
+        pageids={page_id},
+        limit=radius + 1,
+        start_id=rev_id,
+        direction="older",
+        properties={'ids', 'timestamp', 'sha1'} | properties
+    ))
+
+    try:
+        # Extract current rev and reorder history
+        current_rev, past_revs = (
+            current_and_past_revs[0],  # Current rev is the first one returned
+            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
+        )
+    except IndexError:
+        # Only way to get here is if there isn't enough history.  Couldn't be
+        # reverted.  Just return None.
+        return None
+
+    if window is not None and before is None:
+        before = Timestamp(current_rev['timestamp']) + window
+
+    # Load future revisions
+    future_revs = session.revisions.query(
+        pageids={page_id},
+        limit=radius,
+        start_id=rev_id + 1, # Ensures that we skip the current revision
+        end=before,
+        direction="newer",
+        properties={'ids', 'timestamp', 'sha1'} | properties
+    )
+
+    # Convert to an iterable of (checksum, rev) pairs for detect() to consume
+    checksum_revisions = chain(
+        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
+         for rev in past_revs),
+        [(current_rev.get('sha1', DummyChecksum()), current_rev)],
+        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
+         for rev in future_revs),
+    )
+
+    for revert in detect(checksum_revisions, radius=radius):
+        # Check that this is a relevant revert
+        if rev_id in [rev['revid'] for rev in revert.reverteds]:
+            return revert
+
+    return None
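A sketch of the one-at-a-time API check described above (the endpoint and revision ID are placeholders):

    from mw import api
    from mw.lib import reverts

    session = api.Session("https://en.wikipedia.org/w/api.php",
                          user_agent="demo-tool/0.1 (demo@example.org)")

    # Was this edit identity-reverted within 48 hours?
    revert = reverts.api.check(session, 771942902, radius=15,
                               window=48 * 60 * 60)
    if revert is not None:
        print("reverted by", revert.reverting['revid'])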
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/database.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/database.py
new file mode 100644 (file)
index 0000000..a79f86f
--- /dev/null
@@ -0,0 +1,148 @@
+import random
+from itertools import chain
+
+from . import defaults
+from ...types import Timestamp
+from ...util import none_or
+from .dummy_checksum import DummyChecksum
+from .functions import detect
+
+HEX = "1234567890abcdef"
+
+def random_sha1():
+    return ''.join(random.choice(HEX) for i in range(40))
+
+"""
+Simple constant used in order to not do weird things with a dummy revision.
+"""
+
+
+def check_row(db, rev_row, **kwargs):
+    """
+    Checks whether a revision (database row) was reverted (identity) and returns
+    a named tuple of Revert(reverting, reverteds, reverted_to).
+
+    :Parameters:
+        db : :class:`mw.database.DB`
+            A database connection to make use of.
+        rev_row : dict
+            a revision row containing 'rev_id' and 'rev_page' or 'page_id'
+        radius : int
+            a positive integer indicating the maximum number of revisions that can be reverted
+        check_archive : bool
+            should the archive table be checked for reverting revisions?
+        before : `Timestamp`
+            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
+    """
+
+    # extract rev_id, sha1, page_id
+    if 'rev_id' in rev_row:
+        rev_id = rev_row['rev_id']
+    else:
+        raise TypeError("rev_row must have 'rev_id'")
+    if 'page_id' in rev_row:
+        page_id = rev_row['page_id']
+    elif 'rev_page' in rev_row:
+        page_id = rev_row['rev_page']
+    else:
+        raise TypeError("rev_row must have 'page_id' or 'rev_page'")
+
+    # run the regular check
+    return check(db, rev_id, page_id=page_id, **kwargs)
+
+
+def check(db, rev_id, page_id=None, radius=defaults.RADIUS, check_archive=False,
+          before=None, window=None):
+
+    """
+    Checks whether a revision was reverted (identity) and returns a named tuple
+    of Revert(reverting, reverteds, reverted_to).
+
+    :Parameters:
+        db : `mw.database.DB`
+            A database connection to make use of.
+        rev_id : int
+            the ID of the revision to check
+        page_id : int
+            the ID of the page the revision occupies (slower if not provided)
+        radius : int
+            a positive integer indicating the maximum number of revisions that can be reverted
+        check_archive : bool
+            should the archive table be checked for reverting revisions?
+        before : `Timestamp`
+            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
+        window : int
+            if set, limits the search for *reverting* revisions to those which
+            were saved within `window` seconds after the reverted edit
+    """
+
+    if not (hasattr(db, "revisions") and hasattr(db, "all_revisions")):
+        raise TypeError("db wrong type.  Expected a mw.database.DB.")
+
+    rev_id = int(rev_id)
+    radius = int(radius)
+    if radius < 1:
+        raise TypeError("invalid radius.  Expected a positive integer.")
+    page_id = none_or(page_id, int)
+    check_archive = bool(check_archive)
+    before = none_or(before, Timestamp)
+
+    # If we are searching the archive, we'll need to use `all_revisions`.
+    if check_archive:
+        dbrevs = db.all_revisions
+    else:
+        dbrevs = db.revisions
+
+    # If we don't have the page_id, we'll need to look it up
+    if page_id is None:
+        row = dbrevs.get(rev_id=rev_id)
+        page_id = row['rev_page']
+
+    # Load history and current rev
+    current_and_past_revs = list(dbrevs.query(
+        page_id=page_id,
+        limit=radius + 1,
+        before_id=rev_id + 1,  # Ensures that we capture the current revision
+        direction="older"
+    ))
+
+    try:
+        # Extract current rev and reorder history
+        current_rev, past_revs = (
+            current_and_past_revs[0],  # Current rev is the first one returned
+            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
+        )
+    except IndexError:
+        # Only way to get here is if there isn't enough history.  Couldn't be
+        # reverted.  Just return None.
+        return None
+
+    if window is not None and before is None:
+        before = Timestamp(current_rev['rev_timestamp']) + window
+
+    # Load future revisions
+    future_revs = dbrevs.query(
+        page_id=page_id,
+        limit=radius,
+        after_id=rev_id,
+        before=before,
+        direction="newer"
+    )
+
+    # Convert to an iterable of (checksum, rev) pairs for detect() to consume
+    checksum_revisions = chain(
+        ((rev['rev_sha1'] if rev['rev_sha1'] is not None \
+          else DummyChecksum(), rev)
+         for rev in past_revs),
+        [(current_rev['rev_sha1'] or DummyChecksum(), current_rev)],
+        ((rev['rev_sha1'] if rev['rev_sha1'] is not None \
+          else DummyChecksum(), rev)
+         for rev in future_revs)
+    )
+
+    for revert in detect(checksum_revisions, radius=radius):
+        # Check that this is a relevant revert
+        if rev_id in [rev['rev_id'] for rev in revert.reverteds]:
+            return revert
+
+    return None
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/defaults.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/defaults.py
new file mode 100644 (file)
index 0000000..286413f
--- /dev/null
@@ -0,0 +1,24 @@
+RADIUS = 15
+"""
+TODO: Better documentation here.  For the time being, see:
+
+Priedhorsky, R., Chen, J., Lam, S. T. K., Panciera, K., Terveen, L., &
+Riedl, J. (2007, November). Creating, destroying, and restoring value in
+Wikipedia. In Proceedings of the 2007 international ACM conference on
+Supporting group work (pp. 259-268). ACM.
+"""
+
+
+class DUMMY_SHA1: pass
+"""
+Used when checking for reverts in which the checksum of the revision of
+interest is unknown.
+
+>>> DUMMY_SHA1 in {"aaa", "bbb"} # or any 40 character hex
+False
+>>>
+>>> DUMMY_SHA1 == DUMMY_SHA1
+True
+>>> {DUMMY_SHA1, DUMMY_SHA1}
+{<class '__main__.DUMMY_SHA1'>}
+"""
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/detector.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/detector.py
new file mode 100644 (file)
index 0000000..e616e0b
--- /dev/null
@@ -0,0 +1,83 @@
+from collections import namedtuple
+
+from ...util import ordered
+from . import defaults
+
+Revert = namedtuple("Revert", ['reverting', 'reverteds', 'reverted_to'])
+"""
+Represents a revert event.  This class behaves like
+:class:`collections.namedtuple`.  Note that the datatypes of `reverting`,
+`reverteds` and `reverted_to` are not specified since those types will depend
+on the revision data provided during revert detection.
+
+:Members:
+    **reverting**
+        The reverting revision data : `mixed`
+    **reverteds**
+        The reverted revision data (ordered chronologically) : list( `mixed` )
+    **reverted_to**
+        The reverted-to revision data : `mixed`
+"""
+
+
+class Detector(ordered.HistoricalMap):
+    """
+    Detects revert events in a stream of revisions (to the same page) based on
+    matching checksums.  To detect reverts, construct an instance of this class and call
+    :meth:`process` in chronological order (``direction == "newer"``).
+
+    See `<https://meta.wikimedia.org/wiki/R:Identity_revert>`_
+
+    :Parameters:
+        radius : int
+            a positive integer indicating the maximum revision distance that a revert can span.
+
+    :Example:
+        >>> from mw.lib import reverts
+        >>> detector = reverts.Detector()
+        >>>
+        >>> detector.process("aaa", {'rev_id': 1})
+        >>> detector.process("bbb", {'rev_id': 2})
+        >>> detector.process("aaa", {'rev_id': 3})
+        Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})
+        >>> detector.process("ccc", {'rev_id': 4})
+
+    """
+
+    def __init__(self, radius=defaults.RADIUS):
+        """
+        :Parameters:
+            radius : int
+                a positive integer indicating the maximum revision distance that a revert can span.
+        """
+        if radius < 1:
+            raise TypeError("invalid radius. Expected a positive integer.")
+        super().__init__(maxlen=radius + 1)
+
+    def process(self, checksum, revision=None):
+        """
+        Process a new revision and detect a revert if it occurred.  Note that
+        you can pass whatever you like as `revision` and it will be returned in
+        the case that a revert occurs.
+
+        :Parameters:
+            checksum : str
+                Any identity-matchable string-based hash of revision content
+            revision : `mixed`
+                Revision meta data.  Note that any data will just be returned in the
+                case of a revert.
+
+        :Returns:
+            a :class:`~mw.lib.reverts.Revert` if one occurred or `None`
+        """
+        revert = None
+
+        if checksum in self:  # potential revert
+
+            reverteds = list(self.up_to(checksum))
+
+            if len(reverteds) > 0:  # If no reverted revisions, this is a noop
+                revert = Revert(revision, reverteds, self[checksum])
+
+        self.insert(checksum, revision)
+        return revert
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/dummy_checksum.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/dummy_checksum.py
new file mode 100644 (file)
index 0000000..2bc6c67
--- /dev/null
@@ -0,0 +1,24 @@
+class DummyChecksum():
+    """
+    Used when checking for reverts in which the checksum of the revision of
+    interest is unknown.  DummyChecksums won't match each other or anything
+    else, but they will match themselves and they are hashable.
+
+    >>> dummy1 = DummyChecksum()
+    >>> dummy1
+    <#140687347334280>
+    >>> dummy1 == dummy1
+    True
+    >>>
+    >>> dummy2 = DummyChecksum()
+    >>> dummy2
+    <#140687347334504>
+    >>> dummy1 == dummy2
+    False
+    >>>
+    >>> {"foo", "bar", dummy1, dummy1, dummy2}
+    {<#140687347334280>, 'foo', <#140687347334504>, 'bar'}
+    """
+
+    def __str__(self): return repr(self)
+    def __repr__(self): return "<#" + str(id(self)) + ">"
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/functions.py
new file mode 100644 (file)
index 0000000..c129bd9
--- /dev/null
@@ -0,0 +1,46 @@
+from .detector import Detector
+from . import defaults
+
+
+def detect(checksum_revisions, radius=defaults.RADIUS):
+    """
+    Detects reverts that occur in a sequence of revisions.  Note that
+    `revision` meta data will simply be returned in the case of a revert.
+
+    This function serves as a convenience wrapper around calls to
+    :class:`Detector`'s :meth:`~Detector.process`
+    method.
+
+    :Parameters:
+        checksum_revisions : iter( ( checksum : str, revision : `mixed` ) )
+            an iterable over tuples of checksum and revision meta data
+        radius : int
+            a positive integer indicating the maximum revision distance that a revert can span.
+
+    :Returns:
+        an iterator over :class:`Revert`
+
+    :Example:
+        >>> from mw.lib import reverts
+        >>>
+        >>> checksum_revisions = [
+        ...     ("aaa", {'rev_id': 1}),
+        ...     ("bbb", {'rev_id': 2}),
+        ...     ("aaa", {'rev_id': 3}),
+        ...     ("ccc", {'rev_id': 4})
+        ... ]
+        >>>
+        >>> list(reverts.detect(checksum_revisions))
+        [Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})]
+
+    """
+
+    revert_detector = Detector(radius)
+
+    for checksum, revision in checksum_revisions:
+        revert = revert_detector.process(checksum, revision)
+        if revert is not None:
+            yield revert
+
+# For backwards compatibility
+reverts = detect
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_detector.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_detector.py
new file mode 100644 (file)
index 0000000..39b5ad1
--- /dev/null
@@ -0,0 +1,33 @@
+from nose.tools import eq_
+
+from ..detector import Detector
+
+
+def test_detector():
+    detector = Detector(2)
+
+    eq_(detector.process("a", {'id': 1}), None)
+
+    # Check noop
+    eq_(detector.process("a", {'id': 2}), None)
+
+    # Short revert
+    eq_(detector.process("b", {'id': 3}), None)
+    eq_(
+        detector.process("a", {'id': 4}),
+        ({'id': 4}, [{'id': 3}], {'id': 2})
+    )
+
+    # Medium revert
+    eq_(detector.process("c", {'id': 5}), None)
+    eq_(detector.process("d", {'id': 6}), None)
+    eq_(
+        detector.process("a", {'id': 7}),
+        ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
+    )
+
+    # Long (undetected) revert
+    eq_(detector.process("e", {'id': 8}), None)
+    eq_(detector.process("f", {'id': 9}), None)
+    eq_(detector.process("g", {'id': 10}), None)
+    eq_(detector.process("a", {'id': 11}), None)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/tests/test_functions.py
new file mode 100644 (file)
index 0000000..09df733
--- /dev/null
@@ -0,0 +1,23 @@
+from nose.tools import eq_
+
+from ..functions import reverts
+
+
+def test_reverts():
+    checksum_revisions = [
+        ("a", {'id': 1}),
+        ("b", {'id': 2}),
+        ("c", {'id': 3}),
+        ("a", {'id': 4}),
+        ("d", {'id': 5}),
+        ("b", {'id': 6}),
+        ("a", {'id': 7})
+    ]
+
+    expected = [
+        ({'id': 4}, [{'id': 3}, {'id': 2}], {'id': 1}),
+        ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
+    ]
+
+    for revert in reverts(checksum_revisions, radius=2):
+        eq_(revert, expected.pop(0))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/__init__.py
new file mode 100644 (file)
index 0000000..eb5c3e8
--- /dev/null
@@ -0,0 +1,4 @@
+from .functions import cluster, sessions
+from .event import Event
+from .cache import Cache, Session
+from . import defaults
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/cache.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/cache.py
new file mode 100644 (file)
index 0000000..a55348f
--- /dev/null
@@ -0,0 +1,121 @@
+import logging
+from collections import namedtuple
+
+from ...util import Heap
+from ...types import Timestamp
+from . import defaults
+from .event import Event, unpack_events
+
+
+logger = logging.getLogger("mw.lib.sessions.cache")
+
+Session = namedtuple("Session", ["user", "events"])
+"""
+Represents a user session (a cluster over events for a user).  This class
+behaves like :class:`collections.namedtuple`.  Note that the datatype of
+`events` is not specified since it will depend on the event data provided
+during session clustering.
+
+:Members:
+    **user**
+        A hashable user identifier : `hashable`
+    **events**
+        A list of event data : list( `mixed` )
+"""
+
+
+class Cache:
+    """
+    A cache of recent user sessions.  Since sessions expire once activity
+    stops for at least `cutoff` seconds, this class manages a cache of
+    *active* sessions.
+
+    :Parameters:
+        cutoff : int
+            Maximum amount of time in seconds between session events
+
+    :Example:
+        >>> from mw.lib import sessions
+        >>>
+        >>> cache = sessions.Cache(cutoff=3600)
+        >>>
+        >>> list(cache.process("Willy on wheels", 100000, {'rev_id': 1}))
+        []
+        >>> list(cache.process("Walter", 100001, {'rev_id': 2}))
+        []
+        >>> list(cache.process("Willy on wheels", 100001, {'rev_id': 3}))
+        []
+        >>> list(cache.process("Walter", 100035, {'rev_id': 4}))
+        []
+        >>> list(cache.process("Willy on wheels", 103602, {'rev_id': 5}))
+        [Session(user='Willy on wheels', events=[{'rev_id': 1}, {'rev_id': 3}])]
+        >>> list(cache.get_active_sessions())
+        [Session(user='Walter', events=[{'rev_id': 2}, {'rev_id': 4}]), Session(user='Willy on wheels', events=[{'rev_id': 5}])]
+
+
+    """
+
+    def __init__(self, cutoff=defaults.CUTOFF):
+        self.cutoff = int(cutoff)
+
+        self.recently_active = Heap()
+        self.active_users = {}
+
+    def process(self, user, timestamp, data=None):
+        """
+        Processes a user event.
+
+        :Parameters:
+            user : `hashable`
+                A hashable value to identify a user (`int` or `str` are OK)
+            timestamp : :class:`mw.Timestamp`
+                The timestamp of the event
+            data : `mixed`
+                Event meta data
+
+        :Returns:
+            A generator of the :class:`~mw.lib.sessions.Session` that expired
+            while processing the user event.
+        """
+        event = Event(user, Timestamp(timestamp), data)
+
+        for user, events in self._clear_expired(event.timestamp):
+            yield Session(user, unpack_events(events))
+
+        # Apply the event
+        if event.user in self.active_users:
+            events = self.active_users[event.user]
+        else:
+            events = []
+            self.active_users[event.user] = events
+            self.recently_active.push((event.timestamp, events))
+
+        events.append(event)
+
+    def get_active_sessions(self):
+        """
+        Retrieves the active, unexpired sessions.
+
+        :Returns:
+            A generator of :class:`~mw.lib.sessions.Session`
+
+        """
+        for last_timestamp, events in self.recently_active:
+            yield Session(events[-1].user, unpack_events(events))
+
+    def _clear_expired(self, timestamp):
+
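+        # The heap is keyed by the timestamp at which a session was (re)pushed,
+        # so a popped session may still be active; if its last event is recent
+        # enough, it is pushed back keyed by that event's timestamp.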
+        # Cull old sessions
+        while (len(self.recently_active) > 0 and
+               timestamp - self.recently_active.peek()[0] >= self.cutoff):
+
+            _, events = self.recently_active.pop()
+
+            if timestamp - events[-1].timestamp >= self.cutoff:
+                del self.active_users[events[-1].user]
+                yield events[-1].user, events
+            else:
+                self.recently_active.push((events[-1].timestamp, events))
+
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, repr(self.cutoff))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/defaults.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/defaults.py
new file mode 100644 (file)
index 0000000..ed5945a
--- /dev/null
@@ -0,0 +1,6 @@
+CUTOFF = 60 * 60
+"""
+TODO: Better documentation here.
+For the time being, see
+`<https://meta.wikimedia.org/wiki/Research:Edit_session>`_
+"""
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/event.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/event.py
new file mode 100644 (file)
index 0000000..6e57d22
--- /dev/null
@@ -0,0 +1,19 @@
+import logging
+from collections import namedtuple
+
+logger = logging.getLogger("mw.lib.sessions.event")
+
+
+# class Event:
+#   __slots__ = ('user', 'timestamp', 'data')
+#
+#   def __init__(self, user, timestamp, data=None):
+#       self.user = user
+#       self.timestamp = Timestamp(timestamp)
+#       self.data = data
+
+Event = namedtuple("Event", ['user', 'timestamp', 'data'])
+
+
+def unpack_events(events):
+    return list(e.data for e in events)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/functions.py
new file mode 100644 (file)
index 0000000..cede5db
--- /dev/null
@@ -0,0 +1,68 @@
+import logging
+
+from .cache import Cache
+from . import defaults
+
+logger = logging.getLogger("mw.lib.sessions.functions")
+
+
+def cluster(user_events, cutoff=defaults.CUTOFF):
+    """
+    Clusters user sessions from a sequence of user events.  Note that
+    `event` data will simply be returned, grouped into sessions.
+
+    This function serves as a convenience wrapper around calls to
+    :class:`~mw.lib.sessions.Cache`'s :meth:`~mw.lib.sessions.Cache.process`
+    method.
+
+    :Parameters:
+        user_events : iter( (user, timestamp, event) )
+            an iterable over tuples of user, timestamp and event data.
+
+            * user : `hashable`
+            * timestamp : :class:`mw.Timestamp`
+            * event : `mixed`
+
+        cutoff : int
+            the maximum time between events within a user session
+
+    :Returns:
+        an iterator over :class:`~mw.lib.sessions.Session`
+
+    :Example:
+        >>> from mw.lib import sessions
+        >>>
+        >>> user_events = [
+        ...     ("Willy on wheels", 100000, {'rev_id': 1}),
+        ...     ("Walter", 100001, {'rev_id': 2}),
+        ...     ("Willy on wheels", 100001, {'rev_id': 3}),
+        ...     ("Walter", 100035, {'rev_id': 4}),
+        ...     ("Willy on wheels", 103602, {'rev_id': 5})
+        ... ]
+        >>>
+        >>> for user, events in sessions.cluster(user_events):
+        ...     (user, events)
+        ...
+        ('Willy on wheels', [{'rev_id': 1}, {'rev_id': 3}])
+        ('Walter', [{'rev_id': 2}, {'rev_id': 4}])
+        ('Willy on wheels', [{'rev_id': 5}])
+
+
+    """
+
+    # Construct the session manager
+    cache = Cache(cutoff)
+
+    # Apply the events
+    for user, timestamp, event in user_events:
+
+        for session in cache.process(user, timestamp, event):
+            yield session
+
+    # Yield the left-overs
+    for session in cache.get_active_sessions():
+        yield session
+
+
+# For backwards compatibility
+sessions = cluster
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_cache.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_cache.py
new file mode 100644 (file)
index 0000000..2f7f0fa
--- /dev/null
@@ -0,0 +1,22 @@
+from nose.tools import eq_
+
+from ..cache import Cache
+
+
+def test_session_manager():
+    cache = Cache(cutoff=2)
+
+    user_sessions = list(cache.process("foo", 1))
+    eq_(user_sessions, [])
+
+    user_sessions = list(cache.process("bar", 2))
+    eq_(user_sessions, [])
+
+    user_sessions = list(cache.process("foo", 2))
+    eq_(user_sessions, [])
+
+    user_sessions = list(cache.process("bar", 10))
+    eq_(len(user_sessions), 2)
+
+    user_sessions = list(cache.get_active_sessions())
+    eq_(len(user_sessions), 1)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/sessions/tests/test_functions.py
new file mode 100644 (file)
index 0000000..6c3bc43
--- /dev/null
@@ -0,0 +1,50 @@
+from itertools import chain
+
+from nose.tools import eq_
+from .. import defaults
+from ..functions import sessions
+
+
+EVENTS = {
+    "foo": [
+        [
+            ("foo", 1234567890, 1),
+            ("foo", 1234567892, 2),
+            ("foo", 1234567894, 3)
+        ],
+        [
+            ("foo", 1234567894 + defaults.CUTOFF, 4),
+            ("foo", 1234567897 + defaults.CUTOFF, 5)
+        ]
+    ],
+    "bar": [
+        [
+            ("bar", 1234567891, 6),
+            ("bar", 1234567892, 7),
+            ("bar", 1234567893, 8)
+        ],
+        [
+            ("bar", 1234567895 + defaults.CUTOFF, 9),
+            ("bar", 1234567898 + defaults.CUTOFF, 0)
+        ]
+    ]
+}
+
+
+def test_group_events():
+    events = []
+    events.extend(chain(*EVENTS['foo']))
+    events.extend(chain(*EVENTS['bar']))
+
+    events.sort()
+
+    user_sessions = sessions(events)
+
+    counts = {
+        'foo': 0,
+        'bar': 0
+    }
+
+    for user, session in user_sessions:
+        eq_(list(e[2] for e in EVENTS[user][counts[user]]), list(session))
+        counts[user] += 1
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/__init__.py
new file mode 100644 (file)
index 0000000..5b77d0c
--- /dev/null
@@ -0,0 +1,2 @@
+from .functions import normalize
+from .parser import Parser
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/functions.py
new file mode 100644 (file)
index 0000000..931a7a0
--- /dev/null
@@ -0,0 +1,25 @@
+def normalize(title):
+    """
+    Normalizes a page title to the database format.  E.g. spaces are converted
+    to underscores and the first character in the title is converted to
+    upper-case.
+
+    :Parameters:
+        title : str
+            A page title
+    :Returns:
+        The normalized title.
+    :Example:
+        >>> from mw.lib import title
+        >>>
+        >>> title.normalize("foo bar")
+        'Foo_bar'
+
+    """
+    if title is None:
+        return title
+    else:
+        if len(title) > 0:
+            return (title[0].upper() + title[1:]).replace(" ", "_")
+        else:
+            return ""
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/parser.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/parser.py
new file mode 100644 (file)
index 0000000..7467aca
--- /dev/null
@@ -0,0 +1,171 @@
+from ...types import Namespace
+from ...util import autovivifying, none_or
+from .functions import normalize
+
+
+class Parser:
+    """
+    Constructs a page name parser from a set of :class:`mw.Namespace`.  Such a
+    parser can be used to convert a full page name (namespace included with a
+    colon; e.g., ``"Talk:Foo"``) into a namespace ID and
+    :func:`mw.lib.title.normalize`'d page title (e.g., ``(1, "Foo")``).
+
+    :Parameters:
+        namespaces : set( :class:`mw.Namespace` )
+    :Example:
+        >>> from mw import Namespace
+        >>> from mw.lib import title
+        >>>
+        >>> parser = title.Parser(
+        ...     [
+        ...             Namespace(0, "", case="first-letter"),
+        ...             Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
+        ...             Namespace(2, "Usu\u00e1rio(a)", canonical="User", aliases={"U"}, case="first-letter")
+        ...     ]
+        ... )
+        >>>
+        >>> parser.parse("Discuss\u00e3o:Foo") # Using the standard name
+        (1, 'Foo')
+        >>> parser.parse("Talk:Foo bar") # Using the cannonical name
+        (1, 'Foo_bar')
+        >>> parser.parse("U:Foo bar") # Using an alias
+        (2, 'Foo_bar')
+        >>> parser.parse("Herpderp:Foo bar") # Psuedo namespace
+        (0, 'Herpderp:Foo_bar')
+    """
+
+    def __init__(self, namespaces=None):
+        namespaces = none_or(namespaces, set)
+
+        self.ids = {}
+        self.names = {}
+
+        if namespaces is not None:
+            for namespace in namespaces:
+                self.add_namespace(namespace)
+
+    def parse(self, page_name):
+        """
+        Parses a page name to extract the namespace.
+
+        :Parameters:
+            page_name : str
+                A page name including the namespace prefix and a colon (if not Main)
+
+        :Returns:
+            A tuple of (namespace : `int`, title : `str`)
+        """
+        parts = page_name.split(":", 1)
+        if len(parts) == 1:
+            ns_id = 0
+            title = normalize(page_name)
+        else:
+            ns_name, title = parts
+            ns_name, title = normalize(ns_name), normalize(title)
+
+            if self.contains_name(ns_name):
+                ns_id = self.get_namespace(name=ns_name).id
+            else:
+                ns_id = 0
+                title = normalize(page_name)
+
+        return ns_id, title
+
+    def add_namespace(self, namespace):
+        """
+        Adds a namespace to the parser.
+
+        :Parameters:
+            namespace : :class:`mw.Namespace`
+                A namespace
+        """
+        self.ids[namespace.id] = namespace
+        self.names[namespace.name] = namespace
+
+        for alias in namespace.aliases:
+            self.names[alias] = namespace
+
+        if namespace.canonical is not None:
+            self.names[namespace.canonical] = namespace
+
+    def contains_name(self, name):
+        return normalize(name) in self.names
+
+    def get_namespace(self, id=None, name=None):
+        """
+        Gets a namespace from the parser.  Throws a :class:`KeyError` if a
+        namespace cannot be found.
+
+        :Parameters:
+            id : int
+                A namespace ID
+            name : str
+                A namespace name (standard, canonical names and aliases
+                will be searched)
+        :Returns:
+            A :class:`mw.Namespace`.
+        """
+        if id is not None:
+            return self.ids[int(id)]
+        else:
+            return self.names[normalize(name)]
+
+    @classmethod
+    def from_site_info(cls, si_doc):
+        """
+        Constructs a parser from the result of a :meth:`mw.api.SiteInfo.query`.
+
+        :Parameters:
+            si_doc : dict
+                The result of a site_info request.
+
+        :Returns:
+            An initialized :class:`mw.lib.title.Parser`
+        """
+        aliases = autovivifying.Dict(vivifier=lambda k: [])
+        # get aliases
+        if 'namespacealiases' in si_doc:
+            for alias_doc in si_doc['namespacealiases']:
+                aliases[alias_doc['id']].append(alias_doc['*'])
+
+        namespaces = []
+        for ns_doc in si_doc['namespaces'].values():
+            namespaces.append(
+                Namespace.from_doc(ns_doc, aliases)
+            )
+
+        return Parser(namespaces)
+
+    @classmethod
+    def from_api(cls, session):
+        """
+        Constructs a parser from a :class:`mw.api.Session`
+
+        :Parameters:
+            session : :class:`mw.api.Session`
+                An open API session
+
+        :Returns:
+            An initialized :class:`mw.lib.title.Parser`
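+
+        :Example:
+            A network-dependent sketch (the URL is an illustrative
+            placeholder; the output assumes standard English Wikipedia
+            namespaces):
+
+            >>> from mw.api import Session
+            >>> from mw.lib import title
+            >>>
+            >>> parser = title.Parser.from_api(Session("https://en.wikipedia.org/w/api.php"))
+            >>> parser.parse("Talk:Foo bar")
+            (1, 'Foo_bar')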
+        """
+        si_doc = session.site_info.query(
+            properties={'namespaces', 'namespacealiases'}
+        )
+
+        return cls.from_site_info(si_doc)
+
+    @classmethod
+    def from_dump(cls, dump):
+        """
+        Constructs a parser from a :class:`mw.xml_dump.Iterator`.  Note that
+        XML database dumps do not include namespace aliases or canonical
+        names, so the constructed parser will only work in common cases.
+
+        :Parameters:
+            dump : :class:`mw.xml_dump.Iterator`
+                An XML dump iterator
+
+        :Returns:
+            An initialized :class:`mw.lib.title.Parser`
+        """
+        return cls(dump.namespaces)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_functions.py
new file mode 100644 (file)
index 0000000..cee6f68
--- /dev/null
@@ -0,0 +1,10 @@
+from nose.tools import eq_
+
+from ..functions import normalize
+
+
+def test_normalize():
+    eq_("Foobar", normalize("Foobar"))  # Same
+    eq_("Foobar", normalize("foobar"))  # Capitalize
+    eq_("FooBar", normalize("fooBar"))  # Late capital
+    eq_("Foo_bar", normalize("Foo bar"))  # Space
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_parser.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/tests/test_parser.py
new file mode 100644 (file)
index 0000000..a5e64ca
--- /dev/null
@@ -0,0 +1,58 @@
+from nose.tools import eq_
+
+from ....types import Namespace
+from ..parser import Parser
+
+
+def test_simple():
+    parser = Parser(
+        [
+            Namespace(0, "", case="first-letter"),
+            Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
+            Namespace(2, "Usu\u00e1rio(a)", canonical="User", case="first-letter")
+        ]
+    )
+
+    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
+    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
+    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
+
+
+def test_from_site_info():
+    parser = Parser.from_site_info(
+        {
+            "namespaces": {
+                "0": {
+                    "id": 0,
+                    "case": "first-letter",
+                    "*": "",
+                    "content": ""
+                },
+                "1": {
+                    "id": 1,
+                    "case": "first-letter",
+                    "*": "Discuss\u00e3o",
+                    "subpages": "",
+                    "canonical": "Talk"
+                },
+                "2": {
+                    "id": 2,
+                    "case": "first-letter",
+                    "*": "Usu\u00e1rio(a)",
+                    "subpages": "",
+                    "canonical": "User"
+                }
+            },
+            "namespacealiases": [
+                {
+                    "id": 1,
+                    "*": "WAFFLES"
+                }
+            ]
+        }
+    )
+
+    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
+    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
+    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
+    eq_((1, "Foo_bar"), parser.parse("WAFFLES:Foo bar"))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/__init__.py
new file mode 100644 (file)
index 0000000..0994667
--- /dev/null
@@ -0,0 +1,2 @@
+from .timestamp import Timestamp
+from .namespace import Namespace
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/namespace.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/namespace.py
new file mode 100644 (file)
index 0000000..7c6d9bf
--- /dev/null
@@ -0,0 +1,61 @@
+from . import serializable
+from ..util import none_or
+
+
+class Namespace(serializable.Type):
+    """
+    Namespace meta data.
+    """
+
+    __slots__ = ('id', 'name', 'aliases', 'case', 'canonical')
+
+    def __init__(self, id, name, canonical=None, aliases=None, case=None,
+                       content=False):
+        
+        self.id = int(id)
+        """
+        Namespace ID : `int`
+        """
+
+        self.name = none_or(name, str)
+        """
+        Namespace name : `str`
+        """
+
+        self.aliases = serializable.Set.deserialize(aliases or [], str)
+        """
+        Alias names : set( `str` )
+        """
+
+        self.case = none_or(case, str)
+        """
+        Case sensitivity : `str` | `None`
+        """
+
+        self.canonical = none_or(canonical, str)
+        """
+        Canonical name : `str` | `None`
+        """
+        
+        self.content = bool(content)
+        """
+        Is considered a content namespace : `bool`
+        """
+
+    def __hash__(self):
+        return self.id
+    
+    @classmethod
+    def from_doc(cls, doc, aliases={}):
+        """
+        Constructs a namespace object from a namespace doc returned by the API
+        site_info call.
+        """
+        return cls(
+            doc['id'],
+            doc['*'],
+            canonical=doc.get('canonical'),
+            aliases=set(aliases.get(doc['id'], [])),
+            case=doc['case'],
+            content='content' in doc
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/serializable.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/serializable.py
new file mode 100644 (file)
index 0000000..2170b6b
--- /dev/null
@@ -0,0 +1,97 @@
+from itertools import chain
+
+
+class Type:
+    def __eq__(self, other):
+        if other is None:
+            return False
+        try:
+            for key in self.keys():
+                if getattr(self, key) != getattr(other, key):
+                    return False
+
+            return True
+        except AttributeError:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        return "%s(%s)" % (
+            self.__class__.__name__,
+            ", ".join(
+                "%s=%r" % (k, v) for k, v in self.items()
+            )
+        )
+
+    def items(self):
+        for key in self.keys():
+            yield key, getattr(self, key)
+
+    def keys(self):
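+        # Walk both __slots__ (for slotted subclasses) and the instance
+        # __dict__, skipping private/dunder attributes.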
+        return (
+            key for key in
+            chain(getattr(self, "__slots__", []), self.__dict__.keys())
+            if key[:2] != "__"
+        )
+
+    def serialize(self):
+        return dict(
+            (k, self._serialize(v))
+            for k, v in self.items()
+        )
+
+    def _serialize(self, value):
+        if hasattr(value, "serialize"):
+            return value.serialize()
+        else:
+            return value
+
+    @classmethod
+    def deserialize(cls, doc_or_instance):
+        if isinstance(doc_or_instance, cls):
+            return doc_or_instance
+        else:
+            return cls(**doc_or_instance)
+
+
+class Dict(dict, Type):
+    def serialize(self):
+        return {k: self._serialize(v) for k, v in self.items()}
+
+    @staticmethod
+    def deserialize(d, value_deserializer=lambda v: v):
+        if isinstance(d, Dict):
+            return d
+        else:
+            return Dict((k, value_deserializer(v)) for k, v in d.items())
+
+
+class Set(set, Type):
+    def serialize(self):
+        return [self._serialize(v) for v in self]
+
+    @staticmethod
+    def deserialize(s, value_deserializer=lambda v: v):
+
+        if isinstance(s, Set):
+            return s
+        else:
+            return Set(value_deserializer(v) for v in s)
+
+
+class List(list, Type):
+    def serialize(self):
+        return list(self._serialize(v) for v in self)
+
+    @staticmethod
+    def deserialize(l, value_deserializer=lambda v: v):
+
+        if isinstance(l, List):
+            return l
+        else:
+            return List(value_deserializer(v) for v in l)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_namespace.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_namespace.py
new file mode 100644 (file)
index 0000000..4487d29
--- /dev/null
@@ -0,0 +1,32 @@
+from nose.tools import eq_
+
+from ..namespace import Namespace
+
+
+def test_namespace():
+    namespace = Namespace(10, "Foo", canonical="Bar", aliases={'WT'},
+                          case="foobar", content=False)
+
+    eq_(namespace.id, 10)
+    eq_(namespace.name, "Foo")
+    eq_(namespace.canonical, "Bar")
+    eq_(namespace.aliases, {'WT'})
+    eq_(namespace.case, "foobar")
+    eq_(namespace.content, False)
+
+def test_namespace_from_doc():
+    
+    doc = {
+        "id": 0,
+        "case": "first-letter",
+        "*": "",
+        "content": ""
+    }
+    
+    namespace = Namespace.from_doc(doc)
+    eq_(namespace.id, 0)
+    eq_(namespace.name, "")
+    eq_(namespace.canonical, None)
+    eq_(namespace.aliases, set())
+    eq_(namespace.case, "first-letter")
+    eq_(namespace.content, True)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_serializable.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_serializable.py
new file mode 100644 (file)
index 0000000..a439792
--- /dev/null
@@ -0,0 +1,25 @@
+from nose.tools import eq_
+
+from .. import serializable
+
+
+def test_type():
+    class Foo(serializable.Type):
+        def __init__(self, foo, bar):
+            self.foo = foo
+            self.bar = bar
+
+    foo = Foo(1, "bar")
+    eq_(foo, Foo.deserialize(foo))
+    eq_(foo, Foo.deserialize(foo.serialize()))
+
+
+def test_dict():
+    d = serializable.Dict()
+    d['foo'] = "bar"
+    d['derp'] = "herp"
+
+    eq_(d['foo'], "bar")
+    assert 'derp' in d
+
+    eq_(d, serializable.Dict.deserialize(d.serialize(), str))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_timestamp.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/tests/test_timestamp.py
new file mode 100644 (file)
index 0000000..eb3dc0a
--- /dev/null
@@ -0,0 +1,87 @@
+import pickle
+
+from nose.tools import eq_
+
+from ..timestamp import LONG_MW_TIME_STRING, Timestamp
+
+
+def test_self():
+    t1 = Timestamp(1234567890)
+
+    # Unix timestamp
+    eq_(t1, Timestamp(int(t1)))
+
+    # Short format
+    eq_(t1, Timestamp(t1.short_format()))
+
+    # Long format
+    eq_(t1, Timestamp(t1.long_format()))
+
+
+def test_comparison():
+    t1 = Timestamp(1234567890)
+    t2 = Timestamp(1234567891)
+
+    assert t1 < t2, "Less than comparison failed"
+    assert t2 > t1, "Greater than comparison failed"
+    assert not t2 < t1, "Not less than comparison failed"
+    assert not t1 > t2, "Not greater than comparison failed"
+
+    assert t1 <= t2, "Less than or equal to comparison failed"
+    assert t1 <= t1, "Less than or equal to comparison failed"
+    assert t2 >= t1, "Greater than or equal to comparison failed"
+    assert t2 >= t2, "Greater than or equal to comparison failed"
+    assert not t2 <= t1, "Not less than or equal to comparison failed"
+    assert not t1 >= t2, "Not greater than or equal to comparison failed"
+    
+
+
+def test_subtraction():
+    t1 = Timestamp(1234567890)
+    t2 = Timestamp(1234567891)
+
+    eq_(t2 - t1, 1)
+    eq_(t1 - t2, -1)
+    eq_(t2 - 1, t1)
+
+
+def test_strptime():
+    eq_(
+        Timestamp("2009-02-13T23:31:30Z"),
+        Timestamp.strptime("2009-02-13T23:31:30Z", LONG_MW_TIME_STRING)
+    )
+
+    eq_(
+        Timestamp.strptime(
+            "expires 03:20, 21 November 2013 (UTC)",
+            "expires %H:%M, %d %B %Y (UTC)"
+        ),
+        Timestamp("2013-11-21T03:20:00Z")
+    )
+
+
+def test_strftime():
+    eq_(
+        Timestamp("2009-02-13T23:31:30Z").strftime(LONG_MW_TIME_STRING),
+        "2009-02-13T23:31:30Z"
+    )
+
+    eq_(
+        Timestamp("2009-02-13T23:31:30Z").strftime("expires %H:%M, %d %B %Y (UTC)"),
+        "expires 23:31, 13 February 2009 (UTC)"
+    )
+
+
+def test_serialization():
+    timestamp = Timestamp(1234567890)
+    eq_(
+        timestamp,
+        Timestamp.deserialize(timestamp.serialize())
+    )
+
+def test_pickling():
+    timestamp = Timestamp(1234567890)
+    eq_(
+        timestamp,
+        pickle.loads(pickle.dumps(timestamp))
+    )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/timestamp.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/types/timestamp.py
new file mode 100644 (file)
index 0000000..e27cf9a
--- /dev/null
@@ -0,0 +1,318 @@
+import calendar
+import datetime
+import time
+
+from . import serializable
+
+LONG_MW_TIME_STRING = '%Y-%m-%dT%H:%M:%SZ'
+"""
+The longhand version of MediaWiki time strings.
+"""
+
+SHORT_MW_TIME_STRING = '%Y%m%d%H%M%S'
+"""
+The shorthand version of MediaWiki time strings.
+"""
+
+
+class Timestamp(serializable.Type):
+    """
+    An immutable type for working with MediaWiki timestamps in their various
+    forms.
+
+    :Parameters:
+        time_thing : :class:`mw.Timestamp` | :py:class:`~time.time_struct` | :py:class:`~datetime.datetime` | :py:class:`str` | :py:class:`int` | :py:class:`float`
+            The timestamp type from which to construct the timestamp class.
+
+    :Returns:
+        :class:`mw.Timestamp`
+
+    You can make use of a lot of different *time things* to initialize a
+    :class:`mw.Timestamp`.
+
+    * If a :py:class:`~time.time_struct` or :py:class:`~datetime.datetime` is provided, a `Timestamp` will be constructed from its values.
+    * If an `int` or `float` is provided, it will be assumed to be a unix timestamp in seconds since Jan. 1st, 1970 UTC.
+    * If a `str` is provided, it will be checked against known MediaWiki timestamp formats.  E.g., ``'%Y%m%d%H%M%S'`` and ``'%Y-%m-%dT%H:%M:%SZ'``.
+    * If a :class:`mw.Timestamp` is provided, the same `Timestamp` will be returned.
+
+    For example::
+
+        >>> import datetime, time
+        >>> from mw import Timestamp
+        >>> Timestamp(1234567890)
+        Timestamp('2009-02-13T23:31:30Z')
+        >>> Timestamp(1234567890) == Timestamp("2009-02-13T23:31:30Z")
+        True
+        >>> Timestamp(1234567890) == Timestamp("20090213233130")
+        True
+        >>> Timestamp(1234567890) == Timestamp(datetime.datetime.utcfromtimestamp(1234567890))
+        True
+        >>> Timestamp(1234567890) == Timestamp(time.strptime("2009-02-13T23:31:30Z", "%Y-%m-%dT%H:%M:%SZ"))
+        True
+        >>> Timestamp(1234567890) == Timestamp(Timestamp(1234567890))
+        True
+
+
+    You can also do math and comparisons of timestamps.::
+
+        >>> from mw import Timestamp
+        >>> t = Timestamp(1234567890)
+        >>> t
+        Timestamp('2009-02-13T23:31:30Z')
+        >>> t2 = t + 10
+        >>> t2
+        Timestamp('2009-02-13T23:31:40Z')
+        >>> t += 1
+        >>> t
+        Timestamp('2009-02-13T23:31:31Z')
+        >>> t2 - t
+        9
+        >>> t < t2
+        True
+
+
+    """
+
+    def __new__(cls, time_thing):
+        if isinstance(time_thing, cls):
+            return time_thing
+        elif isinstance(time_thing, time.struct_time):
+            return cls.from_time_struct(time_thing)
+        elif isinstance(time_thing, datetime.datetime):
+            return cls.from_datetime(time_thing)
+        elif type(time_thing) in (int, float):
+            return cls.from_unix(time_thing)
+        else:
+            return cls.from_string(time_thing)
+
+    def __init__(self, time_thing):
+        # Important that this does nothing in order to allow __new__ to behave
+        # as expected.  Use `initialize()` instead.
+        pass
+
+    def initialize(self, time_struct):
+        self.__time = time_struct
+
+    def short_format(self):
+        """
+        Constructs a short, ``'%Y%m%d%H%M%S'`` formatted string common to the
+        database. This method is roughly equivalent to calling
+        ``strftime('%Y%m%d%H%M%S')``.
+
+        :Parameters:
+            format : str
+                The string format
+
+        :Returns:
+            A formatted string
+        """
+        return self.strftime(SHORT_MW_TIME_STRING)
+
+    def long_format(self):
+        """
+        Constructs a long, ``'%Y-%m-%dT%H:%M:%SZ'`` formatted string common to the
+        API. This method is roughly equivalent to calling
+        ``strftime('%Y-%m-%dT%H:%M:%SZ')``.
+
+        :Returns:
+            A formatted string
+        """
+        return self.strftime(LONG_MW_TIME_STRING)
+
+    def strftime(self, format):
+        """
+        Constructs a formatted string.
+        See `<https://docs.python.org/3/library/time.html#time.strftime>`_ for a
+        discussion of format descriptors.
+
+        :Parameters:
+            format : str
+                The format description
+
+        :Returns:
+            A formatted string
+        """
+        return time.strftime(format, self.__time)
+
+    @classmethod
+    def strptime(cls, string, format):
+        """
+        Constructs a :class:`mw.Timestamp` from an explicitly formatted string.
+        See `<https://docs.python.org/3/library/time.html#time.strftime>`_ for a
+        discussion of format descriptors.
+
+        :Parameters:
+            string : str
+                A formatted timestamp
+            format : str
+                The format description
+
+        :Returns:
+            :class:`mw.Timestamp`
+        """
+        return cls.from_time_struct(time.strptime(string, format))
+
+    @classmethod
+    def from_time_struct(cls, time_struct):
+        """
+        Constructs a :class:`mw.Timestamp` from a :class:`time.time_struct`.
+
+        :Parameters:
+            time_struct : :class:`time.time_struct`
+                A time structure
+
+        :Returns:
+            :class:`mw.Timestamp`
+        """
+        instance = super().__new__(cls)
+        instance.initialize(time_struct)
+        return instance
+
+    @classmethod
+    def from_datetime(cls, dt):
+        """
+        Constructs a :class:`mw.Timestamp` from a :class:`datetime.datetime`.
+
+        :Parameters:
+            dt : :class:`datetime.datetime`
+                A datetime.
+
+        :Returns:
+            :class:`mw.Timestamp`
+        """
+        time_struct = dt.timetuple()
+        return cls.from_time_struct(time_struct)
+
+    @classmethod
+    def from_unix(cls, seconds):
+        """
+        Constructs a :class:`mw.Timestamp` from a unix timestamp (in seconds
+        since Jan. 1st, 1970 UTC).
+
+        :Parameters:
+            seconds : int
+                A unix timestamp
+
+        :Returns:
+            :class:`mw.Timestamp`
+        """
+        time_struct = datetime.datetime.utcfromtimestamp(seconds).timetuple()
+        return cls.from_time_struct(time_struct)
+
+    @classmethod
+    def from_string(cls, string):
+        """
+        Constructs a :class:`mw.Timestamp` from a MediaWiki formatted string.
+        This method provides a convenient way to construct from common
+        MediaWiki timestamp formats. E.g., ``%Y%m%d%H%M%S`` and
+        ``%Y-%m-%dT%H:%M:%SZ``.
+
+        :Parameters:
+            string : str
+                A formatted timestamp
+
+        :Returns:
+            :class:`mw.Timestamp`
+        """
+        if type(string) == bytes:
+            string = str(string, 'utf8')
+        else:
+            string = str(string)
+
+        try:
+            return cls.strptime(string, SHORT_MW_TIME_STRING)
+        except ValueError:
+            try:
+                return cls.strptime(string, LONG_MW_TIME_STRING)
+            except ValueError:
+                raise ValueError(
+                    "{0} is not a valid Wikipedia date format".format(
+                        repr(string)
+                    )
+                )
+
+    def __format__(self, format):
+        return self.strftime(format)
+
+    def __str__(self):
+        return self.short_format()
+
+    def serialize(self):
+        return self.unix()
+
+    @classmethod
+    def deserialize(cls, time_thing):
+        return Timestamp(time_thing)
+
+    def __repr__(self):
+        return "{0}({1})".format(
+            self.__class__.__name__,
+            repr(self.long_format())
+        )
+
+    def __int__(self):
+        return self.unix()
+
+    def __float__(self):
+        return float(self.unix())
+
+    def unix(self):
+        """
+        :Returns:
+            the number of seconds since Jan. 1st, 1970 UTC.
+        """
+        return int(calendar.timegm(self.__time))
+
+    def __sub__(self, other):
+        if isinstance(other, Timestamp):
+            return self.unix() - other.unix()
+        else:
+            return self + (other * -1)
+
+    def __add__(self, seconds):
+        return Timestamp(self.unix() + seconds)
+
+    def __eq__(self, other):
+        try:
+            return self.__time == other.__time
+        except AttributeError:
+            return False
+
+    def __lt__(self, other):
+        try:
+            return self.__time < other.__time
+        except AttributeError:
+            return NotImplemented
+
+    def __gt__(self, other):
+        try:
+            return self.__time > other.__time
+        except AttributeError:
+            return NotImplemented
+
+    def __le__(self, other):
+        try:
+            return self.__time <= other.__time
+        except AttributeError:
+            return NotImplemented
+
+    def __ge__(self, other):
+        try:
+            return self.__time >= other.__time
+        except AttributeError:
+            return NotImplemented
+
+    def __ne__(self, other):
+        try:
+            return not self.__time == other.__time
+        except AttributeError:
+            return NotImplemented
+        
+    def __getnewargs__(self):
+        return (self.__time,)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/__init__.py
new file mode 100644 (file)
index 0000000..02f8de4
--- /dev/null
@@ -0,0 +1,2 @@
+from .functions import none_or
+from .heap import Heap
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/api.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/api.py
new file mode 100644 (file)
index 0000000..b018b79
--- /dev/null
@@ -0,0 +1,53 @@
+import logging
+import time
+
+import requests
+
+from .functions import none_or
+
+logger = logging.getLogger("mw.util.api.session")
+
+FAILURE_THRESHOLD = 5
+TIMEOUT = 20
+
+
+class Session:
+    def __init__(self, uri, headers=None, timeout=None,
+                 failure_threshold=None, wait_step=2):
+        if uri is None:
+            raise TypeError("uri must not be None")
+
+        self.uri = str(uri)
+        self.headers = headers if headers is not None else {}
+        self.session = requests.Session()
+
+        self.failure_threshold = int(failure_threshold or FAILURE_THRESHOLD)
+        self.timeout = float(timeout or TIMEOUT)
+        self.wait_step = float(wait_step)
+
+        self.failed = 0
+
+    def __sleep(self):
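+        # Back off increasingly: sleep failed * (wait_step ** failed) seconds,
+        # so repeated failures wait progressively longer before retrying.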
+        time.sleep(self.failed * (self.wait_step ** self.failed))
+
+    def get(self, params, **kwargs):
+        return self.request('GET', params, **kwargs)
+
+    def post(self, params, **kwargs):
+        return self.request('POST', params, **kwargs)
+
+    def request(self, type, params, **kwargs):
+        try:
+            result = self.session.request(type, self.uri, params=params,
+                                          timeout=self.timeout, **kwargs)
+            self.failed = 0
+            return result
+        except (requests.HTTPError, requests.ConnectionError):
+            self.failed += 1
+
+            if self.failed > self.failure_threshold:
+                self.failed = 0
+                raise
+            else:
+                self.__sleep()
+                return self.request(type, params, **kwargs)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/autovivifying.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/autovivifying.py
new file mode 100644 (file)
index 0000000..6a9c1aa
--- /dev/null
@@ -0,0 +1,11 @@
+class Dict(dict):
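+    """
+    A dict that "autovivifies": accessing a missing key assigns and returns
+    the value produced by calling `vivifier(key)`.
+
+    >>> d = Dict(vivifier=lambda k: [])
+    >>> d['new'].append(1)
+    >>> d['new']
+    [1]
+    """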
+    def __init__(self, *args, vivifier=lambda k: None, **kwargs):
+        self.vivifier = vivifier
+
+        dict.__init__(self, *args, **kwargs)
+
+    def __getitem__(self, key):
+        if key not in self:
+            dict.__setitem__(self, key, self.vivifier(key))
+
+        return dict.__getitem__(self, key)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/functions.py
new file mode 100644 (file)
index 0000000..28aaa6a
--- /dev/null
@@ -0,0 +1,21 @@
+def none_or(val, func=None, levels=None):
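+    """
+    Returns `None` when `val` is `None`; otherwise casts `val` with `func`,
+    or, when `levels` is given, checks that `val` is one of `levels` and
+    returns it unchanged.
+
+    >>> none_or(None, int) is None
+    True
+    >>> none_or("5", int)
+    5
+    """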
+    if val is None:
+        return None
+    else:
+        if levels is not None:
+            if val not in set(levels):
+                raise KeyError(val)
+
+            return val
+        else:
+            return func(val)
+
+
+def try_keys(dictionary, keys):
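+    """
+    Returns the value of the first key in `keys` found in `dictionary`;
+    raises a KeyError listing every attempted key when none match.
+    """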
+    attempted_keys = []
+    for key in keys:
+        if key in dictionary:
+            return dictionary[key]
+        attempted_keys.append(key)
+
+    raise KeyError("|".join(str(k) for k in attempted_keys))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/heap.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/heap.py
new file mode 100644 (file)
index 0000000..2ae77d1
--- /dev/null
@@ -0,0 +1,22 @@
+import heapq
+
+
+class Heap(list):
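+    """
+    A thin list subclass that keeps the heap invariant via :mod:`heapq`.
+
+    >>> h = Heap([5, 1, 3])
+    >>> h.peek()
+    1
+    >>> h.pop()
+    1
+    """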
+    def __init__(self, *args, **kwargs):
+        list.__init__(self, *args, **kwargs)
+        heapq.heapify(self)
+
+    def pop(self):
+        return heapq.heappop(self)
+
+    def push(self, item):
+        heapq.heappush(self, item)
+
+    def peek(self):
+        return self[0]
+
+    def pushpop(self, item):
+        return heapq.heappushpop(self, item)
+
+    def poppush(self, item):
+        return heapq.heapreplace(self, item)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/__init__.py
new file mode 100644 (file)
index 0000000..92b3ce6
--- /dev/null
@@ -0,0 +1,3 @@
+from .aggregate import aggregate, group
+from .peekable import Peekable
+from .sequence import sequence
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/aggregate.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/aggregate.py
new file mode 100644 (file)
index 0000000..3898255
--- /dev/null
@@ -0,0 +1,20 @@
+from .peekable import Peekable
+
+
+def group(it, by=lambda i: i):
+    return aggregate(it, by)
+
+
+def aggregate(it, by=lambda i: i):
+    it = Peekable(it)
+
+    def chunk(it, by):
+        identifier = by(it.peek())
+        while not it.empty():
+            if identifier == by(it.peek()):
+                yield next(it)
+            else:
+                break
+
+    while not it.empty():
+        yield (by(it.peek()), chunk(it, by))
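+
+# aggregate() lazily groups *consecutive* items that share a key, much like
+# itertools.groupby:
+#
+#     for key, items in aggregate([1, 1, 2, 1], by=lambda i: i):
+#         print(key, list(items))   # prints 1 [1, 1], then 2 [2], then 1 [1]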
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/count.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/count.py
new file mode 100644 (file)
index 0000000..b8a5e6d
--- /dev/null
@@ -0,0 +1,8 @@
+def count(iterable):
+    """
+    Consumes all items in an iterable and returns a count.
+    """
+    n = 0
+    for item in iterable:
+        n += 1
+    return n
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/peekable.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/peekable.py
new file mode 100644 (file)
index 0000000..9ed0a72
--- /dev/null
@@ -0,0 +1,37 @@
+def Peekable(it):
+    if isinstance(it, PeekableType):
+        return it
+    else:
+        return PeekableType(it)
+
+
+class PeekableType:
+    class EMPTY:
+        pass
+
+    def __init__(self, it):
+        self.it = iter(it)
+        self.__cycle()
+
+    def __iter__(self):
+        return self
+
+    def __cycle(self):
+        try:
+            self.lookahead = next(self.it)
+        except StopIteration:
+            self.lookahead = self.EMPTY
+
+    def __next__(self):
+        item = self.peek()
+        self.__cycle()
+        return item
+
+    def peek(self):
+        if self.empty():
+            raise StopIteration()
+        else:
+            return self.lookahead
+
+    def empty(self):
+        # Identity check: items with a custom __eq__ must not match the sentinel.
+        return self.lookahead is self.EMPTY
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/sequence.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/sequence.py
new file mode 100644 (file)
index 0000000..adaac2b
--- /dev/null
@@ -0,0 +1,27 @@
+from .peekable import Peekable
+
+
+def sequence(*iterables, by=None, compare=None):
+    if compare is None:
+        if by is not None:
+            compare = lambda i1, i2: by(i1) <= by(i2)
+        else:
+            compare = lambda i1, i2: i1 <= i2
+
+    iterables = [Peekable(it) for it in iterables]
+
+    done = False
+    while not done:
+
+        next_i = None
+
+        for i, it in enumerate(iterables):
+            if not it.empty():
+                if next_i is None or compare(it.peek(), iterables[next_i].peek()):
+                    next_i = i
+
+        if next_i is None:
+            done = True
+        else:
+            yield next(iterables[next_i])
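+
+# sequence() is a lazy k-way merge: assuming each input iterable is already
+# sorted under `compare` (or under `by`), it repeatedly peeks at every head
+# and yields the smallest:
+#
+#     list(sequence([1, 4], [2, 3]))   # -> [1, 2, 3, 4]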
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_aggregate.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_aggregate.py
new file mode 100644 (file)
index 0000000..7051ee8
--- /dev/null
@@ -0,0 +1,13 @@
+from nose.tools import eq_
+from ..aggregate import aggregate
+
+
+def test_group():
+    l = [0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14]
+    expected = [[0, 1, 2, 3, 4, 5], [10, 11, 12, 13, 14]]
+
+    result = []
+    for identifier, group in aggregate(l, lambda item: int(item / 10)):
+        result.append(list(group))
+
+    eq_(result, expected)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_peekable.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_peekable.py
new file mode 100644 (file)
index 0000000..ec590c1
--- /dev/null
@@ -0,0 +1,20 @@
+from nose.tools import eq_
+from ..peekable import Peekable
+
+
+def test_peekable():
+    iterable = range(0, 100)
+    iterable = Peekable(iterable)
+    expected = list(range(0, 100))
+
+    result = []
+
+    assert not iterable.empty()
+    eq_(iterable.peek(), expected[0])
+    result.append(next(iterable))
+
+    eq_(iterable.peek(), expected[1])
+    result.append(next(iterable))
+
+    result.extend(list(iterable))
+    eq_(result, expected)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_sequence.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/iteration/tests/test_sequence.py
new file mode 100644 (file)
index 0000000..9d54d00
--- /dev/null
@@ -0,0 +1,11 @@
+from nose.tools import eq_
+from ..sequence import sequence
+
+
+def test_sequence():
+    foo = [{'val': 3}, {'val': 5}]
+    bar = [{'val': 1}, {'val': 10}, {'val': 15}]
+    expected = [{'val': 1}, {'val': 3}, {'val': 5}, {'val': 10}, {'val': 15}]
+
+    result = list(sequence(foo, bar, compare=lambda i1, i2: i1['val'] < i2['val']))
+    eq_(expected, result)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/ordered.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/ordered.py
new file mode 100644 (file)
index 0000000..c9080aa
--- /dev/null
@@ -0,0 +1,116 @@
+from . import autovivifying
+
+
+class Circle(list):
+    def __init__(self, maxsize, iterable=None):
+        self._maxsize = int(maxsize)
+        list.__init__(self, [None] * maxsize)
+        self._size = 0
+        self._pointer = 0
+
+        if iterable is not None:
+            # extend() is a generator, so consume it for the appends to occur.
+            for _ in self.extend(iterable):
+                pass
+
+    def state(self):
+        return list(list.__iter__(self))
+
+    def _internalize(self, index):
+        if self._size < self._maxsize:
+            return index
+        else:
+            return (self._pointer + index) % self._maxsize
+
+    def __iter__(self):
+        for i in range(0, self._size):
+            yield list.__getitem__(self, self._internalize(i))
+
+    def __reversed__(self):
+        for i in range(self._size - 1, -1, -1):
+            yield list.__getitem__(self, self._internalize(i))
+
+    def pop(self, index=None):
+        raise NotImplementedError()
+
+    def __len__(self):
+        return self._size
+
+    def __getitem__(self, index):
+        return list.__getitem__(self, self._internalize(index))
+
+    def append(self, value):
+        # Get the old value
+        old_value = list.__getitem__(self, self._pointer)
+
+        # Update internal list
+        list.__setitem__(self, self._pointer, value)
+
+        # Update state
+        self._pointer = (self._pointer + 1) % self._maxsize
+        self._size = min(self._maxsize, self._size + 1)
+
+        # If we overwrote a value, yield it.
+        return old_value
+
+    def extend(self, values):
+        for value in values:
+            expectorate = self.append(value)
+            if expectorate is not None or self._size == self._maxsize:
+                yield expectorate
+
+
+class HistoricalMap(autovivifying.Dict):
+    '''
+    A data structure for efficiently storing and retrieving a
+    limited number of historical records.
+
+    TODO: Rename this to FIFOCache
+    '''
+
+    def __init__(self, *args, maxlen, **kwargs):
+        '''Maxlen specifies the maximum amount of history to keep'''
+        super().__init__(*args, vivifier=lambda k: [], **kwargs)
+
+        self._circle = Circle(maxlen)  # List to preserve order for history
+
+    def __iter__(self):
+        return iter(self._circle)
+
+    def __setitem__(self, key, value):
+        '''Adds a new key-value pair. Returns any discarded values.'''
+
+        # Add to history circle and catch expectorate
+        expectorate = self._circle.append((key, value))
+
+        autovivifying.Dict.__getitem__(self, key).append(value)
+
+        if expectorate is not None:
+            old_key, old_value = expectorate
+            autovivifying.Dict.__getitem__(self, old_key).pop(0)
+            if len(autovivifying.Dict.__getitem__(self, old_key)) == 0:
+                autovivifying.Dict.__delitem__(self, old_key)
+
+            return (old_key, old_value)
+
+    def insert(self, key, value):
+        return self.__setitem__(key, value)
+
+    def __getitem__(self, key):
+        if key in self:
+            return autovivifying.Dict.__getitem__(self, key)[-1]
+        else:
+            raise KeyError(key)
+
+    def get(self, key):
+        '''Gets the most recently added value for a key'''
+        return self.__getitem__(key)
+
+    def up_to(self, key):
+        '''Gets the recently inserted values up to a key'''
+        for okey, ovalue in reversed(self._circle):
+            if okey == key:
+                break
+            else:
+                yield ovalue
+
+    def last(self):
+        return self._circle[-1]
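+
+# Sketch of up_to(): walks the history from newest to oldest, yielding values
+# until the given key is reached:
+#
+#     hist = HistoricalMap(maxlen=10)
+#     hist.insert('a', 1); hist.insert('b', 2); hist.insert('c', 3)
+#     list(hist.up_to('a'))   # -> [3, 2]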
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_autovivifying.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_autovivifying.py
new file mode 100644 (file)
index 0000000..fc154ce
--- /dev/null
@@ -0,0 +1,28 @@
+from nose.tools import eq_
+
+from .. import autovivifying
+
+
+def test_word_count():
+    words = """
+    I am a little teapot short and stout.  Here is my handle and here is my
+    spout.  The red fox jumps over the lazy brown dog.  She sells sea shells
+    by the sea shore.
+    """.replace(".", " ").split()
+
+    # Lame way
+    lame_counts = {}
+    for word in words:
+        if word not in lame_counts:
+            lame_counts[word] = 0
+
+        lame_counts[word] += 1
+
+    # Awesome way
+    awesome_counts = autovivifying.Dict(  # Autovivifies entries with zero.
+                                          vivifier=lambda k: 0  # Useful for counting.
+    )
+    for word in words:
+        awesome_counts[word] += 1
+
+    eq_(lame_counts, awesome_counts)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_functions.py
new file mode 100644 (file)
index 0000000..fdb621a
--- /dev/null
@@ -0,0 +1,14 @@
+from nose.tools import eq_
+
+from ..functions import none_or
+
+
+def test_none_or():
+    eq_(10, none_or("10", int))
+    eq_(10.75, none_or("10.75", float))
+    eq_(None, none_or(None, int))
+    assert none_or("", str) is not None
+    assert none_or([], list) is not None
+    assert none_or({}, dict) is not None
+    assert none_or(0, int) is not None
+    assert none_or(-1, int) is not None
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_heap.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_heap.py
new file mode 100644 (file)
index 0000000..7e70562
--- /dev/null
@@ -0,0 +1,28 @@
+from nose.tools import eq_
+
+from ..heap import Heap
+
+
+def test_heap():
+    h = Heap([5, 4, 7, 8, 2])
+    eq_(h.pop(), 2)
+    eq_(h.pop(), 4)
+    eq_(h.pop(), 5)
+    eq_(h.pop(), 7)
+    eq_(h.pop(), 8)
+    eq_(len(h), 0)
+
+    h = Heap([10, 20, 100])
+    eq_(h.pop(), 10)
+    h.push(30)
+    eq_(len(h), 3)
+    eq_(h.pop(), 20)
+    eq_(h.pop(), 30)
+    eq_(h.pop(), 100)
+    eq_(len(h), 0)
+
+    h = Heap([(1, 7), (2, 4), (10, -100)])
+    eq_(h.peek(), (1, 7))
+    h.pop()
+    eq_(h.pop(), (2, 4))
+    eq_(h.pop(), (10, -100))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_ordered.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/util/tests/test_ordered.py
new file mode 100644 (file)
index 0000000..0bd7be9
--- /dev/null
@@ -0,0 +1,41 @@
+from nose.tools import eq_
+
+from .. import ordered
+
+
+def test_circle():
+    circle = ordered.Circle(3)
+
+    eq_(0, len(circle))
+    print(circle.state())
+    eq_(None, circle.append(5))
+    eq_(1, len(circle))
+    print(circle.state())
+    eq_(None, circle.append(6))
+    eq_(2, len(circle))
+    print(circle.state())
+    eq_(None, circle.append(7))
+    eq_(3, len(circle))
+    print(circle.state())
+    eq_(5, circle.append(8))
+    eq_(3, len(circle))
+    print(circle.state())
+
+    eq_([6, 7, 8], list(circle))
+
+    print(circle.state())
+    eq_([8, 7, 6], list(reversed(circle)))
+
+
+def test_historical_map():
+    hist = ordered.HistoricalMap(maxlen=2)
+
+    assert "foo" not in hist
+
+    eq_(None, hist.insert('foo', "bar1"))
+
+    assert "foo" in hist
+
+    eq_(None, hist.insert('foo', "bar2"))
+
+    eq_(('foo', "bar1"), hist.insert('not_foo', "not_bar"))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/7zfile.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/7zfile.py
new file mode 100644 (file)
index 0000000..b7308bc
--- /dev/null
@@ -0,0 +1,69 @@
+"""
+This is a failed attempt.  See
+https://github.com/halfak/Mediawiki-Utilities/issues/13 for more details.
+"""
+
+'''
+import os
+
+import py7zlib
+
+
+class SevenZFileError(py7zlib.ArchiveError):
+    pass
+
+class SevenZFile(object):
+    @classmethod
+    def is_7zfile(cls, filepath):
+        """ Determine if filepath points to a valid 7z archive. """
+        is7z = False
+        fp = None
+        try:
+            fp = open(filepath, 'rb')
+            archive = py7zlib.Archive7z(fp)
+            n = len(archive.getnames())
+            is7z = True
+        finally:
+            if fp: fp.close()
+        return is7z
+
+    def __init__(self, filepath):
+        fp = open(filepath, 'rb')
+        self.filepath = filepath
+        self.archive = py7zlib.Archive7z(fp)
+
+    def __contains__(self, name):
+        return name in self.archive.getnames()
+
+    def bytestream(self, name):
+        """ Iterate stream of bytes from an archive member. """
+        if name not in self:
+            raise SevenZFileError('member %s not found in %s' %
+                                  (name, self.filepath))
+        else:
+            member = self.archive.getmember(name)
+            for byte in member.read():
+                if not byte: break
+                yield byte
+
+    def readlines(self, name):
+        """ Iterate lines from an archive member. """
+        linesep = os.linesep[-1]
+        line = ''
+        for ch in self.bytestream(name):
+            line += ch
+            if ch == linesep:
+                yield line
+                line = ''
+        if line: yield line
+        
+    
+import os
+
+import py7zlib
+
+with open("/mnt/data/xmldatadumps/public/simplewiki/20141122/simplewiki-20141122-pages-meta-history.xml.7z", "rb") as f:
+    a = py7zlib.Archive7z(f)
+    
+    print(a.getmember(a.getnames()[0]).read())
+'''
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/__init__.py
new file mode 100644 (file)
index 0000000..6b3d7ec
--- /dev/null
@@ -0,0 +1,27 @@
+"""
+This module is a collection of utilities for efficiently processing MediaWiki's
+XML database dumps.  There are two important concerns that this module intends
+to address: *performance* and the *complexity* of streaming XML parsing.
+
+Performance
+    Performance is a serious concern when processing large database XML dumps.
+    Regretfully, the Global Intepreter Lock prevents us from running threads on
+    multiple CPUs.  This library provides a :func:`map`, a function
+    that maps a dump processing over a set of dump files using
+    :class:`multiprocessing` to distribute the work over multiple CPUS
+
+Complexity
+    Streaming XML parsing is gross.  XML dumps are (1) some site meta data, (2)
+    a collection of pages that contain (3) collections of revisions.  The
+    module allows you to think about dump files in this way and ignore the
+    fact that you're streaming XML.  An :class:`Iterator` contains
+    site meta data and an iterator of :class:`Page`'s.  A
+    :class:`Page` contains page meta data and an iterator of
+    :class:`Revision`'s.  A :class:`Revision` contains revision meta data,
+    including a :class:`Contributor` (if a contributor was specified in the
+    XML).
+
+"""
+from .map import map
+from .iteration import *
+from .functions import file, open_file
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/element_iterator.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/element_iterator.py
new file mode 100644 (file)
index 0000000..b16d0b7
--- /dev/null
@@ -0,0 +1,104 @@
+
+try:
+    import xml.etree.cElementTree as etree
+except ImportError:
+    import xml.etree.ElementTree as etree
+
+from xml.etree.ElementTree import ParseError
+
+from .errors import MalformedXML
+
+
+def trim_ns(tag):
+    return tag[tag.find("}") + 1:]
+
+
+class EventPointer:
+    def __init__(self, etree_events):
+        self.tag_stack = []
+        self.etree_events = etree_events
+
+    def __next__(self):
+        event, element = next(self.etree_events)
+
+        tag = trim_ns(element.tag)
+
+        if event == "start":
+            self.tag_stack.append(tag)
+        else:
+            if self.tag_stack[-1] == tag:
+                self.tag_stack.pop()
+            else:
+                raise MalformedXML("Expected {0}, but saw {1}.".format(
+                    self.tag_stack[-1],
+                    tag)
+                )
+
+        return event, element
+
+    def depth(self):
+        return len(self.tag_stack)
+
+    @classmethod
+    def from_file(cls, f):
+        return EventPointer(etree.iterparse(f, events=("start", "end")))
+
+
+class ElementIterator:
+    def __init__(self, element, pointer):
+        self.pointer = pointer
+        self.element = element
+        self.depth = pointer.depth() - 1
+
+        self.done = False
+
+    def __iter__(self):
+
+        while not self.done and self.pointer.depth() > self.depth:
+            event, element = next(self.pointer)
+
+            if event == "start":
+                sub_iterator = ElementIterator(element, self.pointer)
+
+                yield sub_iterator
+
+                sub_iterator.clear()
+
+        self.done = True
+
+    def complete(self):
+
+        while not self.done and self.pointer.depth() > self.depth:
+            event, element = next(self.pointer)
+            if self.pointer.depth() > self.depth:
+                element.clear()
+
+        self.done = True
+
+    def clear(self):
+        self.complete()
+        self.element.clear()
+
+    def attr(self, key, alt=None):
+        return self.element.attrib.get(key, alt)
+
+    def __getattr__(self, attr):
+        if attr == "tag":
+            return trim_ns(self.element.tag)
+        elif attr == "text":
+            self.complete()
+            return self.element.text
+        else:
+            raise AttributeError("%s has no attribute %r" % (self.__class__.__name__, attr))
+
+    @classmethod
+    def from_file(cls, f):
+        
+        try:
+            pointer = EventPointer.from_file(f)
+            event, element = next(pointer)
+            return cls(element, pointer)
+        except ParseError as e:
+            raise ParseError(
+                    "{0}: {1}...".format(str(e),
+                                         str(f.read(500), 'utf-8', 'replace')))
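+
+# Sketch: ElementIterator exposes the event stream as nested iterators, so
+# callers can walk children without tracking depth themselves ("dump.xml" is
+# a hypothetical path):
+#
+#     root = ElementIterator.from_file(open("dump.xml"))
+#     for child in root:        # e.g. <siteinfo>, then each <page>
+#         print(child.tag)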
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/errors.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/errors.py
new file mode 100644 (file)
index 0000000..f6dfd74
--- /dev/null
@@ -0,0 +1,12 @@
+class FileTypeError(Exception):
+    """
+    Raised when an XML dump file is not of an expected type.
+    """
+    pass
+
+
+class MalformedXML(Exception):
+    """
+    Raised when an XML dump file is not formatted as expected.
+    """
+    pass
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/functions.py
new file mode 100644 (file)
index 0000000..2f58e5f
--- /dev/null
@@ -0,0 +1,76 @@
+import os
+import re
+import subprocess
+
+from .errors import FileTypeError
+
+EXTENSIONS = {
+    'xml': ["cat"],
+    'gz': ["zcat"],
+    'bz2': ["bzcat"],
+    '7z': ["7z", "e", "-so"],
+    'lzma': ["lzcat"]
+}
+"""
+A map from file extension to the command to run to extract the data to standard out.
+"""
+
+EXT_RE = re.compile(r'\.([^\.]+)$')
+"""
+A regular expression for extracting the final extension of a file.
+"""
+
+
+def file(path_or_f):
+    """
+    Verifies that a file exists at a given path and that the file has a
+    known extension type.
+
+    :Parameters:
+        path_or_f : `str` | `file`
+            the path to a dump file, or an already-open file-like object
+            (returned as-is)
+
+    """
+    if hasattr(path_or_f, "readline"):
+        return path_or_f
+    else:
+        path = path_or_f
+
+    path = os.path.expanduser(path)
+    if not os.path.isfile(path):
+        raise FileTypeError("Can't find file %s" % path)
+
+    match = EXT_RE.search(path)
+    if match is None:
+        raise FileTypeError("No extension found for %s." % path)
+    elif match.groups()[0] not in EXTENSIONS:
+        raise FileTypeError("File type %r is not supported." % path)
+    else:
+        return path
+
+
+def open_file(path_or_f):
+    """
+    Turns a path to a dump file into a file-like object of (decompressed)
+    XML data.
+
+    :Parameters:
+        path_or_f : `str` | `file`
+            the path to the dump file to read, or an already-open file-like
+            object (returned as-is)
+    """
+    if hasattr(path_or_f, "read"):
+        return path_or_f
+    else:
+        path = path_or_f
+
+    match = EXT_RE.search(path)
+    ext = match.groups()[0]
+    p = subprocess.Popen(
+        EXTENSIONS[ext] + [path],
+        stdout=subprocess.PIPE,
+        stderr=open(os.devnull, "w")
+    )
+    # sys.stderr.write("\n%s %s\n" % (EXTENSIONS[ext], path))
+    # sys.stderr.write(p.stdout.read(1000))
+    # return False
+    return p.stdout
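+
+# Usage sketch (hypothetical path): open_file() shells out to the
+# decompressor registered in EXTENSIONS and returns its stdout pipe, so
+# callers read decompressed bytes as a stream:
+#
+#     f = open_file("dumps/example-pages-meta-history.xml.bz2")  # runs bzcat
+#     first_chunk = f.read(1024)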
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/__init__.py
new file mode 100644 (file)
index 0000000..163b872
--- /dev/null
@@ -0,0 +1,10 @@
+"""
+Iteration over the pages and revisions of an XML dump.
+"""
+from .iterator import Iterator
+from .page import Page
+from .redirect import Redirect
+from .revision import Revision
+from .comment import Comment
+from .contributor import Contributor
+from .text import Text
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/comment.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/comment.py
new file mode 100644 (file)
index 0000000..9d9cbca
--- /dev/null
@@ -0,0 +1,54 @@
+from ...types import serializable
+
+
+class Comment(str, serializable.Type):
+    """
+    A revision comment.  This class behaves identically to
+    :class:`str` except that it takes and stores an additional parameter
+    recording whether the comment was deleted or not.
+
+    >>> from mw.xml_dump import Comment
+    >>>
+    >>> c = Comment("foo")
+    >>> c == "foo"
+    True
+    >>> c.deleted
+    False
+
+    **deleted**
+        Was the comment deleted? | `bool`
+
+    """
+
+    def __new__(cls, string_or_comment="", deleted=False):
+        if isinstance(string_or_comment, cls):
+            return string_or_comment
+        else:
+            inst = super().__new__(cls, string_or_comment)
+            inst.initialize(string_or_comment, deleted)
+            return inst
+
+    def initialize(self, string, deleted):
+        self.deleted = bool(deleted)
+
+    def __str__(self):
+        return str.__str__(self)
+
+    def __repr__(self):
+        return "{0}({1})".format(
+            self.__class__.__name__,
+            ", ".join([
+                str.__repr__(self),
+                "deleted={0}".format(repr(self.deleted))
+            ])
+        )
+
+    def serialize(self):
+        return {
+            "string_or_comment": str(self),
+            "deleted": self.deleted
+        }
+
+    @classmethod
+    def from_element(cls, e):
+        return cls(e.text, e.attr('deleted', False))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/contributor.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/contributor.py
new file mode 100644 (file)
index 0000000..21ff2ad
--- /dev/null
@@ -0,0 +1,45 @@
+from ...types import serializable
+from ...util import none_or
+
+from .util import consume_tags
+
+
+class Contributor(serializable.Type):
+    """
+    Contributor meta data.
+    """
+    __slots__ = ('id', 'user_text')
+
+    TAG_MAP = {
+        'id': lambda e: int(e.text),
+        'username': lambda e: str(e.text),
+        'ip': lambda e: str(e.text)
+    }
+
+    def __init__(self, id, user_text):
+        self.id = none_or(id, int)
+        """
+        User ID : int | `None` (if not specified in the XML)
+
+        User ID of the user if the contributor was signed into an account
+        while making the contribution, and `None` when the contributor was
+        not signed in.
+        """
+
+        self.user_text = none_or(user_text, str)
+        """
+        User name or IP address : str | `None` (if not specified in the XML)
+
+        If a user is logged in, this will reflect the user's account
+        name.  If the user is not logged in, this will usually be
+        recorded as the IPv4 or IPv6 address in the XML.
+        """
+
+    @classmethod
+    def from_element(cls, element):
+        values = consume_tags(cls.TAG_MAP, element)
+
+        return cls(
+            values.get('id'),
+            values.get('username', values.get('ip'))
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/iterator.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/iterator.py
new file mode 100644 (file)
index 0000000..4e26e18
--- /dev/null
@@ -0,0 +1,221 @@
+import io
+
+from ...types import serializable
+from ...util import none_or
+from ..element_iterator import ElementIterator
+from ..errors import MalformedXML
+from .namespace import Namespace
+from .page import Page
+
+
+class ConcatinatingTextReader(io.TextIOBase):
+
+    def __init__(self, *items):
+        self.items = [io.StringIO(i) if isinstance(i, str) else i
+                      for i in items]
+
+    def read(self, size=-1):
+        return "".join(self._read(size))
+
+    def readline(self):
+        # Skip exhausted items so that an empty string (EOF) is only
+        # returned once every item has been consumed.
+        while len(self.items) > 0:
+            line = self.items[0].readline()
+            if line == "":
+                self.items.pop(0)
+            else:
+                return line
+
+        return ""
+
+    def _read(self, size):
+        if size > 0:
+            while len(self.items) > 0:
+                byte_vals = self.items[0].read(size)
+                yield byte_vals
+                if len(byte_vals) < size:
+                    size = size - len(byte_vals) # Decrement bytes
+                    self.items.pop(0)
+                else:
+                    break
+
+        else:
+            for item in self.items:
+                yield item.read()
+
+
+def concat(*stream_items):
+    return ConcatinatingTextReader(*stream_items)
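+
+# concat() lets from_page_xml() (defined below) wrap a bare <page> fragment
+# in a synthetic <mediawiki> header and footer without materializing the
+# whole document as a single string.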
+
+
+class Iterator(serializable.Type):
+    """
+    XML Dump Iterator. Dump file meta data and a
+    :class:`~mw.xml_dump.Page` iterator.  Instances of this class can be
+    called as an iterator directly.  E.g.::
+
+        from mw.xml_dump import Iterator
+
+        # Construct dump file iterator
+        dump = Iterator.from_file(open("example/dump.xml"))
+
+        # Iterate through pages
+        for page in dump:
+
+            # Iterate through a page's revisions
+            for revision in page:
+
+                print(revision.id)
+
+    """
+    __slots__ = ('site_name', 'dbname', 'base', 'generator', 'case',
+                 'namespaces', '__pages')
+
+    def __init__(self, site_name=None, dbname=None, base=None, generator=None,
+                 case=None, namespaces=None, pages=None):
+
+        self.site_name = none_or(site_name, str)
+        """
+        The name of the site. : str | `None` (if not specified in the XML)
+        """
+
+        self.dbname = none_or(dbname, str)
+        """
+        The database name of the site. : str | `None` (if not specified in the
+        XML)
+        """
+
+        self.base = none_or(base, str)
+        """
+        TODO: ??? : str | `None` (if not specified in the XML)
+        """
+
+        self.generator = none_or(generator, str)
+        """
+        TODO: ??? : str | `None` (if not specified in the XML)
+        """
+
+        self.case = none_or(case, str)
+        """
+        TODO: ??? : str | `None` (if not specified in the XML)
+        """
+
+        self.namespaces = none_or(namespaces, list)
+        """
+        A list of :class:`mw.Namespace` | `None` (if not specified in the XML)
+        """
+
+        # Should be a lazy generator of page info
+        self.__pages = pages
+
+    def __iter__(self):
+        return self.__pages
+
+    def __next__(self):
+        return next(self.__pages)
+
+    @classmethod
+    def load_namespaces(cls, element):
+        namespaces = []
+        for sub_element in element:
+            tag = sub_element.tag
+
+            if tag == "namespace":
+                namespace = Namespace.from_element(sub_element)
+                namespaces.append(namespace)
+            else:
+                assert False, "This should never happen"
+
+        return namespaces
+
+    @classmethod
+    def load_site_info(cls, element):
+
+        site_name = None
+        dbname = None
+        base = None
+        generator = None
+        case = None
+        namespaces = {}
+
+        for sub_element in element:
+            if sub_element.tag == 'sitename':
+                site_name = sub_element.text
+            if sub_element.tag == 'dbname':
+                dbname = sub_element.text
+            elif sub_element.tag == 'base':
+                base = sub_element.text
+            elif sub_element.tag == 'generator':
+                generator = sub_element.text
+            elif sub_element.tag == 'case':
+                case = sub_element.text
+            elif sub_element.tag == 'namespaces':
+                namespaces = cls.load_namespaces(sub_element)
+
+        return site_name, dbname, base, generator, case, namespaces
+
+    @classmethod
+    def load_pages(cls, element):
+
+        for sub_element in element:
+            tag = sub_element.tag
+
+            if tag == "page":
+                yield Page.from_element(sub_element)
+            else:
+                raise MalformedXML("Expected to see 'page'.  " +
+                                   "Instead saw '{0}'".format(tag))
+
+    @classmethod
+    def from_element(cls, element):
+
+        site_name = None
+        dbname = None
+        base = None
+        generator = None
+        case = None
+        namespaces = None
+
+        # Consume <siteinfo>
+        for sub_element in element:
+            tag = sub_element.tag
+            if tag == "siteinfo":
+                site_name, dbname, base, generator, case, namespaces = \
+                    cls.load_site_info(sub_element)
+                break
+
+        # Consume all <page>
+        pages = cls.load_pages(element)
+
+        return cls(site_name, dbname, base, generator, case, namespaces, pages)
+
+    @classmethod
+    def from_file(cls, f):
+        element = ElementIterator.from_file(f)
+        assert element.tag == "mediawiki"
+        return cls.from_element(element)
+
+    @classmethod
+    def from_string(cls, string):
+        f = io.StringIO(string)
+        element = ElementIterator.from_file(f)
+        assert element.tag == "mediawiki"
+        return cls.from_element(element)
+
+    @classmethod
+    def from_page_xml(cls, page_xml):
+        header = """
+        <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/"
+                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                   xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/
+                     http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5"
+                   xml:lang="en">
+        <siteinfo>
+            <namespaces>
+            </namespaces>
+        </siteinfo>
+        """
+
+        footer = "</mediawiki>"
+
+        return cls.from_file(concat(header, page_xml, footer))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/namespace.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/namespace.py
new file mode 100644 (file)
index 0000000..652fabf
--- /dev/null
@@ -0,0 +1,11 @@
+from ... import types
+
+
+class Namespace(types.Namespace):
+    @classmethod
+    def from_element(cls, element):
+        return cls(
+            element.attr('key'),
+            element.text,
+            case=element.attr('case')
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/page.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/page.py
new file mode 100644 (file)
index 0000000..e05d882
--- /dev/null
@@ -0,0 +1,118 @@
+from ...types import serializable
+from ...util import none_or
+from ..errors import MalformedXML
+from .redirect import Redirect
+from .revision import Revision
+
+
+class Page(serializable.Type):
+    """
+    Page meta data and a :class:`~mw.xml_dump.Revision` iterator.  Instances of
+    this class can be called as iterators directly.  E.g.
+
+    .. code-block:: python
+
+        page = mw.xml_dump.Page( ... )
+
+        for revision in page:
+            print("{0} {1}".format(revision.id, page_id))
+
+    """
+    __slots__ = (
+        'id',
+        'title',
+        'namespace',
+        'redirect',
+        'restrictions',
+        '__revisions'
+    )
+
+    def __init__(self, id, title, namespace, redirect, restrictions, revisions=None):
+        self.id = none_or(id, int)
+        """
+        Page ID : `int`
+        """
+
+        self.title = none_or(title, str)
+        """
+        Page title (namespace excluded) : `str`
+        """
+
+        self.namespace = none_or(namespace, int)
+        """
+        Namespace ID : `int`
+        """
+
+        self.redirect = none_or(redirect, Redirect)
+        """
+        Page is currently redirect? : :class:`~mw.xml_dump.Redirect` | `None`
+        """
+
+        self.restrictions = serializable.List.deserialize(restrictions)
+        """
+        A list of page editing restrictions (empty unless restrictions are specified) : list( `str` )
+        """
+
+        # Should be a lazy generator
+        self.__revisions = revisions or iter([])
+
+    def __iter__(self):
+        return self.__revisions
+
+    def __next__(self):
+        return next(self.__revisions)
+
+    @classmethod
+    def load_revisions(cls, first_revision, element):
+        yield Revision.from_element(first_revision)
+
+        for sub_element in element:
+            tag = sub_element.tag
+
+            if tag == "revision":
+                yield Revision.from_element(sub_element)
+            else:
+                raise MalformedXML("Expected to see 'revision'.  " +
+                                   "Instead saw '{0}'".format(tag))
+
+    @classmethod
+    def from_element(cls, element):
+        title = None
+        namespace = None
+        id = None
+        redirect = None
+        restrictions = []
+
+        first_revision = None
+
+        # Consume each of the elements until we see <revision>, which marks
+        # the end of the page metadata.
+        for sub_element in element:
+            tag = sub_element.tag
+            if tag == "title":
+                title = sub_element.text
+            elif tag == "ns":
+                namespace = sub_element.text
+            elif tag == "id":
+                id = int(sub_element.text)
+            elif tag == "redirect":
+                redirect = Redirect.from_element(sub_element)
+            elif tag == "restrictions":
+                restrictions.append(sub_element.text)
+            elif tag == "DiscussionThreading":
+                continue
+            elif tag == "sha1":
+                continue
+            elif tag == "revision":
+                first_revision = sub_element
+                break
+            # Assuming that the first revision seen marks the end of page
+            # metadata.  I'm not too keen on this assumption, so I'm leaving
+            # this long comment to warn whoever ends up maintaining this.
+            else:
+                raise MalformedXML("Unexpected tag found when processing " +
+                                   "a <page>: '{0}'".format(tag))
+
+        # Assuming that I got here by seeing a <revision> tag.  See verbose
+        # comment above.
+        revisions = cls.load_revisions(first_revision, element)
+
+        return cls(id, title, namespace, redirect, restrictions, revisions)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/redirect.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/redirect.py
new file mode 100644 (file)
index 0000000..28876ed
--- /dev/null
@@ -0,0 +1,29 @@
+from ...types import serializable
+from ...util import none_or
+
+
+class Redirect(serializable.Type):
+    """
+    Represents a redirect tag.
+
+    **title**
+        Full page name that this page is redirected to : `str`
+    """
+
+    def __new__(cls, redirect_or_title):
+        if isinstance(redirect_or_title, cls):
+            return redirect_or_title
+        else:
+            inst = super().__new__(cls)
+            inst.initialize(redirect_or_title)
+            return inst
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def initialize(self, title):
+        self.title = none_or(title, str)
+
+    @classmethod
+    def from_element(cls, e):
+        return cls(e.attr('title'))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/revision.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/revision.py
new file mode 100644 (file)
index 0000000..85fa7a9
--- /dev/null
@@ -0,0 +1,116 @@
+from ...types import serializable, Timestamp
+from ...util import none_or
+from .comment import Comment
+from .contributor import Contributor
+from .text import Text
+from .util import consume_tags
+
+
+class Revision(serializable.Type):
+    """
+    Revision meta data.
+    """
+    __slots__ = ('id', 'timestamp', 'contributor', 'minor', 'comment', 'text',
+                 'bytes', 'sha1', 'parent_id', 'model', 'format',
+                 'beginningofpage')
+
+    TAG_MAP = {
+        'id': lambda e: int(e.text),
+        'timestamp': lambda e: Timestamp(e.text),
+        'contributor': lambda e: Contributor.from_element(e),
+        'minor': lambda e: True,
+        'comment': lambda e: Comment.from_element(e),
+        'text': lambda e: Text.from_element(e),
+        'sha1': lambda e: str(e.text),
+        'parentid': lambda e: int(e.text),
+        'model': lambda e: str(e.text),
+        'format': lambda e: str(e.text)
+    }
+
+    def __init__(self, id, timestamp, contributor=None, minor=None,
+                 comment=None, text=None, bytes=None, sha1=None,
+                 parent_id=None, model=None, format=None,
+                 beginningofpage=False):
+        self.id = none_or(id, int)
+        """
+        Revision ID : `int`
+        """
+
+        self.timestamp = none_or(timestamp, Timestamp)
+        """
+        Revision timestamp : :class:`mw.Timestamp`
+        """
+
+        self.contributor = none_or(contributor, Contributor.deserialize)
+        """
+        Contributor meta data : :class:`~mw.xml_dump.Contributor` | `None`
+        """
+
+        self.minor = none_or(minor, bool) or False
+        """
+        Is revision a minor change? : `bool`
+        """
+
+        self.comment = none_or(comment, Comment)
+        """
+        Comment left with revision : :class:`~mw.xml_dump.Comment` (behaves like `str`, with additional members)
+        """
+
+        self.text = none_or(text, Text)
+        """
+        Content of text : :class:`~mw.xml_dump.Text` (behaves like `str`, with additional members)
+        """
+
+        self.bytes = none_or(bytes, int)
+        """
+        Number of bytes of content : `int`
+        """
+
+        self.sha1 = none_or(sha1, str)
+        """
+        sha1 hash of the content : `str`
+        """
+
+        self.parent_id = none_or(parent_id, int)
+        """
+        Revision ID of preceding revision : `int` | `None`
+        """
+
+        self.model = none_or(model, str)
+        """
+        TODO: ??? : `str`
+        """
+
+        self.format = none_or(format, str)
+        """
+        TODO: ??? : `str`
+        """
+
+        self.beginningofpage = bool(beginningofpage)
+        """
+        Is the first revision of a page : `bool`
+        Used to identify the first revision of a page when using Wikihadoop
+        revision pairs.  Otherwise is always set to False.  Do not expect to use
+        this when processing an XML dump directly.
+        """
+
+    @classmethod
+    def from_element(cls, element):
+        values = consume_tags(cls.TAG_MAP, element)
+
+        return cls(
+            values.get('id'),
+            values.get('timestamp'),
+            values.get('contributor'),
+            values.get('minor') is not None,
+            values.get('comment'),
+            values.get('text'),
+            values.get('bytes'),
+            values.get('sha1'),
+            values.get('parentid'),
+            values.get('model'),
+            values.get('format'),
+            element.attr('beginningofpage') is not None
+            # For Wikihadoop.  Probably never used by anything, ever.
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_comment.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_comment.py
new file mode 100644 (file)
index 0000000..36ab580
--- /dev/null
@@ -0,0 +1,33 @@
+from nose.tools import eq_
+
+from ..comment import Comment
+
+
+def test_immutability():
+    c = Comment("foo")
+    b = Comment(c)
+    eq_(id(c), id(b))
+
+
+def test_empty_constructor():
+    c = Comment()
+    eq_(c, "")
+    eq_(c.deleted, False)
+
+
+def test_deleted_constructor():
+    c = Comment("", deleted=True)
+    eq_(c, "")
+    eq_(c.deleted, True)
+
+
+def test_full_constructor():
+    c = Comment("Foobar!", deleted=False)
+    eq_(c, "Foobar!")
+    eq_(c.deleted, False)
+
+
+def test_serialize():
+    c = Comment("Foobar!", deleted=False)
+    c2 = Comment.deserialize(c.serialize())
+    eq_(c2.deleted, False)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_iterator.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_iterator.py
new file mode 100644 (file)
index 0000000..348a48c
--- /dev/null
@@ -0,0 +1,266 @@
+import io
+
+from nose.tools import eq_, assert_is_instance
+
+from ....types import Timestamp
+from ..iterator import Iterator
+from ..comment import Comment
+from ..text import Text
+from ..revision import Revision
+from ..page import Page
+
+
+SAMPLE_XML = """
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http
+://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd" version="0.8" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.22wmf2</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Foo</title>
+    <ns>0</ns>
+    <id>1</id>
+    <revision beginningofpage="true">
+      <id>1</id>
+      <timestamp>2004-08-09T09:04:08Z</timestamp>
+      <contributor>
+        <username>Gen0cide</username>
+        <id>92182</id>
+      </contributor>
+      <text xml:space="preserve" bytes="234" id="55">Revision 1 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+    <revision>
+      <id>2</id>
+      <timestamp>2004-08-10T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.109</ip>
+      </contributor>
+      <text xml:space="preserve" bytes="235" id="56">Revision 2 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <comment>Comment 2</comment>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+  <page>
+    <title>Bar</title>
+    <ns>1</ns>
+    <id>2</id>
+    <redirect title="Computer accessibility" />
+    <restrictions>edit=sysop:move=sysop</restrictions>
+    <revision beginningofpage="true">
+      <id>3</id>
+      <timestamp>2004-08-11T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.22</ip>
+      </contributor>
+      <text xml:space="preserve" bytes="236" id="57">Revision 3 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+    <revision>
+      <id>4</id>
+      <timestamp>2004-08-12T09:04:08Z</timestamp>
+      <text id="58" bytes="237" />
+      <sha1>6ixvq7o1yg75n9g9chqqg94myzq11c5</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+</mediawiki>"""
+
+
+def test_complete():
+    f = io.StringIO(SAMPLE_XML)
+
+    dump = Iterator.from_file(f)
+    eq_([0, 1], list(ns.id for ns in dump.namespaces))
+
+    page = next(dump)
+    eq_(page.title, "Foo")
+    eq_(page.namespace, 0)
+    eq_(page.id, 1)
+    eq_(page.redirect, None)
+    eq_(page.restrictions, [])
+
+    revision = next(page)
+    eq_(revision.id, 1)
+    eq_(revision.timestamp, Timestamp("2004-08-09T09:04:08Z"))
+    eq_(revision.contributor.id, 92182)
+    eq_(revision.contributor.user_text, "Gen0cide")
+    assert_is_instance(revision.text, Text)
+    eq_(revision.text, "Revision 1 text")
+    eq_(revision.text.bytes, 234)
+    eq_(revision.text.id, 55)
+    eq_(revision.text, "Revision 1 text")
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    eq_(revision.comment, None)
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+    eq_(revision.beginningofpage, True)
+
+    revision = next(page)
+    eq_(revision.id, 2)
+    eq_(revision.timestamp, Timestamp("2004-08-10T09:04:08Z"))
+    eq_(revision.contributor.id, None)
+    eq_(revision.contributor.user_text, "222.152.210.109")
+    eq_(revision.text, "Revision 2 text")
+    eq_(revision.text.bytes, 235)
+    eq_(revision.text.id, 56)
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    assert_is_instance(revision.comment, Comment)
+    eq_(revision.comment, "Comment 2")
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+    eq_(revision.beginningofpage, False)
+
+    page = next(dump)
+    assert_is_instance(page, Page)
+    eq_(page.title, "Bar")
+    eq_(page.namespace, 1)
+    eq_(page.id, 2)
+    eq_(page.redirect.title, "Computer accessibility")
+    eq_(page.restrictions, ["edit=sysop:move=sysop"])
+
+    revision = next(page)
+    assert_is_instance(revision, Revision)
+    eq_(revision.id, 3)
+    eq_(revision.timestamp, Timestamp("2004-08-11T09:04:08Z"))
+    eq_(revision.contributor.id, None)
+    eq_(revision.contributor.user_text, "222.152.210.22")
+    assert_is_instance(revision.text, Text)
+    eq_(revision.text.bytes, 236)
+    eq_(revision.text.id, 57)
+    eq_(revision.text, "Revision 3 text")
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    eq_(revision.comment, None)
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+    assert_is_instance(str(page), str)
+
+    revision = next(page)
+    assert_is_instance(revision, Revision)
+    eq_(revision.id, 4)
+    eq_(revision.timestamp, Timestamp("2004-08-12T09:04:08Z"))
+    eq_(revision.contributor, None)
+    assert_is_instance(revision.text, Text)
+    eq_(revision.text.bytes, 237)
+    eq_(revision.text.id, 58)
+    eq_(revision.text, "")
+    eq_(revision.sha1, "6ixvq7o1yg75n9g9chqqg94myzq11c5")
+    eq_(revision.comment, None)
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+    assert_is_instance(str(revision), str)
+
+
+def test_skipping():
+    f = io.StringIO(SAMPLE_XML)
+
+    dump = Iterator.from_file(f)
+
+    page = next(dump)
+    eq_(page.title, "Foo")
+    eq_(page.namespace, 0)
+    eq_(page.id, 1)
+
+    page = next(dump)
+    eq_(page.title, "Bar")
+    eq_(page.namespace, 1)
+    eq_(page.id, 2)
+
+    revision = next(page)
+    eq_(revision.id, 3)
+    eq_(revision.timestamp, Timestamp("2004-08-11T09:04:08Z"))
+    eq_(revision.contributor.id, None)
+    eq_(revision.contributor.user_text, "222.152.210.22")
+    assert_is_instance(revision.text, Text)
+    eq_(revision.text, "Revision 3 text")
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    eq_(revision.comment, None)
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+
+
+def test_serialization():
+    f = io.StringIO(SAMPLE_XML)
+
+    dump = Iterator.from_file(f)
+
+    eq_(dump, Iterator.deserialize(dump.serialize()))
+
+def test_from_page_xml():
+    page_xml = """
+    <page>
+      <title>Foo</title>
+      <ns>0</ns>
+      <id>1</id>
+      <revision>
+        <id>1</id>
+        <timestamp>2004-08-09T09:04:08Z</timestamp>
+        <contributor>
+          <username>Gen0cide</username>
+          <id>92182</id>
+        </contributor>
+        <text xml:space="preserve">Revision 1 text</text>
+        <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+        <model>wikitext</model>
+        <format>text/x-wiki</format>
+      </revision>
+      <revision>
+        <id>2</id>
+        <timestamp>2004-08-10T09:04:08Z</timestamp>
+        <contributor>
+          <ip>222.152.210.109</ip>
+        </contributor>
+        <text xml:space="preserve">Revision 2 text</text>
+        <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+        <model>wikitext</model>
+        <comment>Comment 2</comment>
+        <format>text/x-wiki</format>
+      </revision>
+    </page>
+    """
+
+    dump = Iterator.from_page_xml(io.StringIO(page_xml))
+
+    # The dump has a `namespaces` list, but it's empty.
+    eq_(dump.namespaces, [])
+
+    page = next(dump)
+    eq_(page.title, "Foo")
+    eq_(page.namespace, 0)
+    eq_(page.id, 1)
+
+    revision = next(page)
+    eq_(revision.id, 1)
+    eq_(revision.timestamp, Timestamp("2004-08-09T09:04:08Z"))
+    eq_(revision.contributor.id, 92182)
+    eq_(revision.contributor.user_text, "Gen0cide")
+    eq_(revision.text, "Revision 1 text")
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    eq_(revision.comment, None)
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
+
+    revision = next(page)
+    eq_(revision.id, 2)
+    eq_(revision.timestamp, Timestamp("2004-08-10T09:04:08Z"))
+    eq_(revision.contributor.id, None)
+    eq_(revision.contributor.user_text, "222.152.210.109")
+    eq_(revision.text, "Revision 2 text")
+    eq_(revision.sha1, "g9chqqg94myzq11c56ixvq7o1yg75n9")
+    eq_(revision.comment, "Comment 2")
+    eq_(revision.model, "wikitext")
+    eq_(revision.format, "text/x-wiki")
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_text.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/tests/test_text.py
new file mode 100644 (file)
index 0000000..51f9225
--- /dev/null
@@ -0,0 +1,45 @@
+from nose.tools import eq_
+
+from ..text import Text
+
+
+def test_immutability():
+    a = Text("foo")
+    b = Text(a)
+    eq_(id(a), id(b))
+
+
+def test_empty_constructor():
+    t = Text()
+    eq_(t, "")
+    eq_(t.deleted, False)
+    eq_(t.id, None)
+    eq_(t.xml_space, "preserve")
+    eq_(t.bytes, None)
+
+
+def test_deleted_constructor():
+    t = Text("", deleted=True)
+    eq_(t, "")
+    eq_(t.deleted, True)
+    eq_(t.id, None)
+    eq_(t.xml_space, "preserve")
+    eq_(t.bytes, None)
+
+
+def test_full_constructor():
+    t = Text("Foobar!", deleted=False, id=10, xml_space="foobar", bytes=1001)
+    eq_(t, "Foobar!")
+    eq_(t.deleted, False)
+    eq_(t.id, 10)
+    eq_(t.xml_space, "foobar")
+    eq_(t.bytes, 1001)
+
+
+def test_serialize():
+    t = Text("Foobar!", deleted=False, id=10, xml_space="foobar", bytes=1001)
+    t2 = Text.deserialize(t.serialize())
+    eq_(t2.deleted, False)
+    eq_(t2.id, 10)
+    eq_(t2.xml_space, "foobar")
+    eq_(t2.bytes, 1001)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/text.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/text.py
new file mode 100644 (file)
index 0000000..9c51108
--- /dev/null
@@ -0,0 +1,74 @@
+from ...types import serializable
+from ...util import none_or
+
+
+class Text(str, serializable.Type):
+    """
+    Revision text content.  This class behaves identically to
+    :class:`str` except that it takes and stores an additional set of parameters.
+
+    **deleted**
+        Was the text deleted? : `bool`
+    **xml_space**
+        What to do with extra whitespace? : `str`
+    **id**
+        TODO: ??? : `int` | `None`
+    **bytes**
+        TODO: ??? : `int` | `None`
+
+    >>> from mw.xml_dump import Text
+    >>>
+    >>> t = Text("foo")
+    >>> t == "foo"
+    True
+    >>> t.deleted
+    False
+    >>> t.xml_space
+    'preserve'
+    """
+
+    def __new__(cls, string_or_text="", deleted=False, xml_space="preserve", id=None, bytes=None):
+        if isinstance(string_or_text, cls):
+            return string_or_text
+        else:
+            inst = super().__new__(cls, string_or_text)
+            inst.initialize(string_or_text, deleted, xml_space, id, bytes)
+            return inst
+
+    def initialize(self, string, deleted, xml_space, id, bytes):
+        self.deleted = bool(deleted)
+        self.xml_space = none_or(xml_space, str)
+        self.id = none_or(id, int)
+        self.bytes = none_or(bytes, int)
+
+    def __str__(self):
+        return str.__str__(self)
+
+    def __repr__(self):
+        return "{0}({1})".format(
+            self.__class__.__name__,
+            ", ".join([
+                str.__repr__(self),
+                "deleted={0}".format(repr(self.deleted))
+            ])
+        )
+
+    def serialize(self):
+        return {
+            "string_or_text": str(self),
+            "deleted": self.deleted,
+            "xml_space": self.xml_space,
+            "id": self.id,
+            "bytes": self.bytes
+        }
+
+    @classmethod
+    def from_element(cls, e):
+        content = e.text or ""
+        return cls(
+            content,
+            deleted=e.attr('deleted', False),
+            xml_space=e.attr('xml:space'),
+            id=e.attr('id'),
+            bytes=e.attr('bytes')
+        )
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/util.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/iteration/util.py
new file mode 100644 (file)
index 0000000..0ce8590
--- /dev/null
@@ -0,0 +1,9 @@
+def consume_tags(tag_map, element):
+    value_map = {}
+    for sub_element in element:
+        tag_name = sub_element.tag
+
+        if tag_name in tag_map:
+            value_map[tag_name] = tag_map[tag_name](sub_element)
+
+    return value_map
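+
+# Sketch: consume_tags() runs each recognized child tag through its extractor
+# and collects the results, e.g. with Revision.TAG_MAP a child <id>17</id>
+# produces value_map['id'] == 17.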
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/map.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/map.py
new file mode 100644 (file)
index 0000000..0018a15
--- /dev/null
@@ -0,0 +1,101 @@
+import logging
+from multiprocessing import Queue, Value, cpu_count
+from queue import Empty
+
+from .functions import file
+from .processor import DONE, Processor
+
+logger = logging.getLogger("mw.xml_dump.map")
+
+
+def re_raise(error, path):
+    raise error
+
+
+def map(paths, process_dump, handle_error=re_raise,
+        threads=cpu_count(), output_buffer=100):
+    """
+    Maps a function across a set of dump files and returns
+    an (order not guaranteed) iterator over the output.
+
+    The `process_dump` function must return an iterable object (such as a
+    generator).  If your process_dump function does not need to produce
+    output, make it return an empty `iterable` upon completion (like an empty
+    list).
+
+    :Parameters:
+        paths : iter( str )
+            a list of paths to dump files to process
+        process_dump : function( dump : :class:`~mw.xml_dump.Iterator`, path : str)
+            a function to run on every :class:`~mw.xml_dump.Iterator`
+        handle_error : function( error : Exception, path : str)
+            a function called with the error and the dump path when
+            `process_dump` fails; the default re-raises the error
+        threads : int
+            the number of individual processing threads to spool up
+        output_buffer : int
+            the maximum number of output values to buffer.
+
+    :Returns:
+        An iterator over values yielded by calls to `process_dump()`
+    :Example:
+        .. code-block:: python
+
+            from mw import xml_dump
+
+            files = ["examples/dump.xml", "examples/dump2.xml"]
+
+            def page_info(dump, path):
+                for page in dump:
+
+                    yield page.id, page.namespace, page.title
+
+
+            for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
+                print("\t".join([str(page_id), str(page_namespace), page_title]))
+    """
+    paths = list(paths)
+    pathsq = queue_files(paths)
+    outputq = Queue(maxsize=output_buffer)
+    threads = max(1, min(int(threads), pathsq.qsize()))
+
+    processors = []
+
+    for i in range(0, threads):
+        processor = Processor(
+            pathsq,
+            outputq,
+            process_dump
+        )
+        processor.start()
+        processors.append(processor)
+
+    # drain the output queue until every processor has signaled DONE
+    done = 0
+    while done < len(processors):
+        try:
+            error, item = outputq.get(timeout=.25)
+        except Empty:
+            continue
+
+        if not error:
+            if item is DONE:
+                done += 1
+            else:
+                yield item
+        else:
+            error, path = item
+            handle_error(error, path)
+
+def queue_files(paths):
+    """
+    Produces a `multiprocessing.Queue` containing path for each value in
+    `paths` to be used by the `Processor`s.
+
+    :Parameters:
+        paths : iterable
+            the paths to add to the processing queue
+    """
+    q = Queue()
+    for path in paths:
+        q.put(file(path))
+    return q
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/processor.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/processor.py
new file mode 100644 (file)
index 0000000..a496ad8
--- /dev/null
@@ -0,0 +1,48 @@
+import logging
+import traceback
+from collections import namedtuple
+from multiprocessing import Process
+from queue import Empty
+
+from .functions import open_file
+from .iteration import Iterator
+
+logger = logging.getLogger("mw.dump.processor")
+
+ErrorItem = namedtuple("ErrorItem", ['error', 'item'])
+
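+# Sentinel: each Processor puts one DONE on the output queue when it runs out
+# of paths, letting map() count how many workers have finished.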
+class DONE: pass
+
+
+class Processor(Process):
+    def __init__(self, pathq, outputq, process_dump, logger=logger):
+        self.pathq = pathq
+        self.outputq = outputq
+        self.process_dump = process_dump
+        self.logger = logger
+        Process.__init__(self)
+
+    def run(self):
+        try:
+            while True:
+
+                # Touching qsize() forces the queue to reset & behave
+                # reasonably before the non-blocking get below
+                self.pathq.qsize()
+                path = self.pathq.get(block=False)
+                dump = Iterator.from_file(open_file(path))
+                self.logger.info("Beginning to process {0}.".format(repr(path)))
+                try:
+                    for out in self.process_dump(dump, path):
+                        self.outputq.put(ErrorItem(False, out))
+                except Exception as error:
+
+                    self.outputq.put(ErrorItem(True, (error, path)))
+
+                    self.logger.error(
+                        "Failed while processing dump " +
+                        "{0}: {1}".format(repr(path),
+                                          "\n" + traceback.format_exc()))
+        except Empty:
+            self.logger.info("Nothing left to do.  Shutting down thread.")
+        finally:
+            self.outputq.put(ErrorItem(False, DONE))
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/__init__.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_element_iterator.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_element_iterator.py
new file mode 100644 (file)
index 0000000..932b156
--- /dev/null
@@ -0,0 +1,104 @@
+import io
+
+from nose.tools import eq_
+from ..element_iterator import EventPointer, ElementIterator
+
+
+TEST_XML = """
+<foo>
+    <bar>
+        <herp>content</herp>
+    </bar>
+    <derp foo="bar"></derp>
+</foo>
+"""
+
+
+def test_pointer():
+    pointer = EventPointer.from_file(io.StringIO(TEST_XML))
+
+    eq_(pointer.tag_stack, [])
+    eq_(pointer.depth(), 0)
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo"])
+    eq_(pointer.depth(), 1)
+    eq_(element.tag, "foo")
+    eq_(event, "start")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo", "bar"])
+    eq_(pointer.depth(), 2)
+    eq_(element.tag, "bar")
+    eq_(event, "start")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo", "bar", "herp"])
+    eq_(pointer.depth(), 3)
+    eq_(element.tag, "herp")
+    eq_(event, "start")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo", "bar"])
+    eq_(pointer.depth(), 2)
+    eq_(element.tag, "herp")
+    eq_(event, "end")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo"])
+    eq_(pointer.depth(), 1)
+    eq_(element.tag, "bar")
+    eq_(event, "end")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo", "derp"])
+    eq_(pointer.depth(), 2)
+    eq_(element.tag, "derp")
+    eq_(event, "start")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, ["foo"])
+    eq_(pointer.depth(), 1)
+    eq_(element.tag, "derp")
+    eq_(event, "end")
+
+    event, element = next(pointer)
+    eq_(pointer.tag_stack, [])
+    eq_(pointer.depth(), 0)
+    eq_(element.tag, "foo")
+    eq_(event, "end")
+
+    try:
+        event, element = next(pointer)
+    except StopIteration:
+        return True
+
+    assert False, "Iteration did not stop as expected."
+
+
+def test_iterator():
+    foo_element = ElementIterator.from_file(io.StringIO(TEST_XML))
+    foo_iterator = iter(foo_element)
+
+    bar_element = next(foo_iterator)
+    bar_iterator = iter(bar_element)
+    eq_(bar_element.tag, "bar")
+
+    herp_element = next(bar_iterator)
+    eq_(herp_element.tag, "herp")
+    eq_(herp_element.text, "content")
+
+    derp_element = next(foo_iterator)
+    eq_(derp_element.tag, "derp")
+    eq_(derp_element.attr("foo"), "bar")
+
+
+def test_skipping_iterator():
+    foo_element = ElementIterator.from_file(io.StringIO(TEST_XML))
+    foo_iterator = iter(foo_element)
+
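+    # consume <bar> without iterating its children; the next element from
+    # foo_iterator should skip ahead to the sibling <derp>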
+    bar_element = next(foo_iterator)
+
+    derp_element = next(foo_iterator)
+    eq_(derp_element.tag, "derp")
+    eq_(derp_element.attr("foo"), "bar")
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_functions.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_functions.py
new file mode 100644 (file)
index 0000000..53dab7f
--- /dev/null
@@ -0,0 +1,10 @@
+import os.path
+
+from nose.tools import eq_
+
+from ..functions import open_file
+
+
+def test_open_file_7z():
+    f = open_file(os.path.join(os.path.dirname(__file__), "test.7z"))
+    eq_(f.read(), b"foobartest\n")
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_map.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_map.py
new file mode 100644 (file)
index 0000000..ed5da93
--- /dev/null
@@ -0,0 +1,170 @@
+import io
+
+from nose.tools import eq_, raises
+
+from ..map import map
+
+
+SAMPLE_XML = """
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/
+           http://www.mediawiki.org/xml/export-0.8.xsd"
+           version="0.8" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.22wmf2</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Foo</title>
+    <ns>0</ns>
+    <id>1</id>
+    <revision>
+      <id>1</id>
+      <timestamp>2004-08-09T09:04:08Z</timestamp>
+      <contributor>
+        <username>Gen0cide</username>
+        <id>92182</id>
+      </contributor>
+      <text xml:space="preserve">Revision 1 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+    <revision>
+      <id>2</id>
+      <timestamp>2004-08-10T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.109</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 2 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <comment>Comment 2</comment>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+  <page>
+    <title>Bar</title>
+    <ns>1</ns>
+    <id>2</id>
+    <revision>
+      <id>3</id>
+      <timestamp>2004-08-11T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.22</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 3 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+</mediawiki>"""
+
+
+def test_map():
+    f = io.StringIO(SAMPLE_XML)
+
+    def process_dump(dump, path):
+        for page in dump:
+            count = 0
+            for rev in page:
+                count += 1
+
+            yield {'page_id': page.id, 'revisions': count}
+
+    pages = 0
+    for doc in map([f], process_dump):
+        page_id = doc['page_id']
+        revisions = doc['revisions']
+        if page_id == 1:
+            eq_(revisions, 2)
+        elif page_id == 2:
+            eq_(revisions, 1)
+        else:
+            assert False
+
+        pages += 1
+
+    eq_(pages, 2)
+
+
+def test_dict_yield():
+    # checks that dicts yielded by process_dump pass through map() intact
+    f = io.StringIO(SAMPLE_XML)
+
+    def process_dump(dump, path):
+        for page in dump:
+            count = 0
+            for rev in page:
+                count += 1
+
+            yield {'page_id': page.id, 'revisions': count}
+
+    pages = 0
+    for doc in map([f], process_dump):
+        page_id = doc['page_id']
+        revisions = doc['revisions']
+        if page_id == 1:
+            eq_(revisions, 2)
+        elif page_id == 2:
+            eq_(revisions, 1)
+        else:
+            assert False
+
+        pages += 1
+
+    eq_(pages, 2)
+
+
+@raises(TypeError)
+def test_map_error():
+    f = io.StringIO(SAMPLE_XML)
+
+    def process_dump(dump, path):
+        for page in dump:
+
+            if page.id == 2:
+                raise TypeError("Fake error")
+
+    pages = 0
+    for doc in map([f], process_dump):
+        page_id = doc['page_id']
+
+
+def test_map_error_handler():
+    f = io.StringIO(SAMPLE_XML)
+
+    def process_dump(dump, path):
+        for page in dump:
+            count = 0
+
+            for rev in page:
+                count += 1
+
+            if count > 2:
+                raise TypeError("Fake type error.")
+
+            yield {'page_id': page.id, 'revisions': count}
+
+    pages = 0
+    for doc in map([f], process_dump, handle_error=lambda exp, stack: None):
+        page_id = doc['page_id']
+        revisions = doc['revisions']
+        if page_id == 1:
+            eq_(revisions, 2)
+        elif page_id == 2:
+            eq_(revisions, 1)
+        else:
+            assert False
+
+        pages += 1
+
+    eq_(pages, 2)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_processor.py b/mediawiki_dump_tools/Mediawiki-Utilities/mw/xml_dump/tests/test_processor.py
new file mode 100644 (file)
index 0000000..64fe11e
--- /dev/null
@@ -0,0 +1,120 @@
+import io
+from multiprocessing import Queue
+
+from nose.tools import eq_, raises
+
+from ..processor import DONE, Processor
+
+
+SAMPLE_XML = """
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/
+           http://www.mediawiki.org/xml/export-0.8.xsd"
+           version="0.8" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.22wmf2</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Foo</title>
+    <ns>0</ns>
+    <id>1</id>
+    <revision>
+      <id>1</id>
+      <timestamp>2004-08-09T09:04:08Z</timestamp>
+      <contributor>
+        <username>Gen0cide</username>
+        <id>92182</id>
+      </contributor>
+      <text xml:space="preserve">Revision 1 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+    <revision>
+      <id>2</id>
+      <timestamp>2004-08-10T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.109</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 2 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <comment>Comment 2</comment>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+  <page>
+    <title>Bar</title>
+    <ns>1</ns>
+    <id>2</id>
+    <revision>
+      <id>3</id>
+      <timestamp>2004-08-11T09:04:08Z</timestamp>
+      <contributor>
+        <ip>222.152.210.22</ip>
+      </contributor>
+      <text xml:space="preserve">Revision 3 text</text>
+      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
+      <model>wikitext</model>
+      <format>text/x-wiki</format>
+    </revision>
+  </page>
+</mediawiki>"""
+
+
+
+def test_processor():
+
+    pathq = Queue()
+    pathq.put(io.StringIO(SAMPLE_XML))
+
+    outputq = Queue()
+
+    def process_dump(dump, path):
+        for page in dump:
+            yield page.id
+
+
+    processor = Processor(pathq, outputq, process_dump)
+    processor.start()
+
+    error, item = outputq.get()
+    assert not error
+    eq_(item, 1)
+
+    error, item = outputq.get()
+    assert not error
+    eq_(item, 2)
+
+    error, item = outputq.get()
+    assert not error
+    eq_(item, DONE)
+
+def test_processor_error():
+
+    pathq = Queue()
+    pathq.put(io.StringIO(SAMPLE_XML))
+
+    outputq = Queue()
+
+    def process_dump(dump, path):
+        raise Exception("foo")
+
+
+    processor = Processor(pathq, outputq, process_dump)
+    processor.start()
+
+    error, item = outputq.get()
+    assert error
+
+    error, item = outputq.get()
+    assert not error
+    eq_(item, DONE)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/setup.py b/mediawiki_dump_tools/Mediawiki-Utilities/setup.py
new file mode 100644 (file)
index 0000000..d3d09f1
--- /dev/null
@@ -0,0 +1,34 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='mediawiki-utilities',
+    version="0.4.18",
+    author='Aaron Halfaker',
+    author_email='aaron.halfaker@gmail.com',
+    packages=find_packages(),
+    scripts=[],
+    url='http://pypi.python.org/pypi/mediawiki-utilities',
+    license=open('LICENSE').read(),
+    description='A set of utilities for extracting and processing MediaWiki data.',
+    long_description=open('README.rst').read(),
+    install_requires=[
+        "requests>=2.4",
+        "pymysql>=0.6.2"],
+    test_suite='nose.collector',
+    classifiers=[
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3 :: Only",
+        "Environment :: Other Environment",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Text Processing :: Linguistic",
+        "Topic :: Text Processing :: General",
+        "Topic :: Utilities",
+        "Topic :: Scientific/Engineering"
+    ],
+)
diff --git a/mediawiki_dump_tools/Mediawiki-Utilities/tox.ini b/mediawiki_dump_tools/Mediawiki-Utilities/tox.ini
new file mode 100644 (file)
index 0000000..f203bc3
--- /dev/null
@@ -0,0 +1,12 @@
+[tox]
+minversion = 1.6
+skipsdist = True
+envlist = flake8
+
+[flake8]
+exclude = .venv,.tox,dist,doc,build,*.egg
+max-line-length = 120
+
+[testenv:flake8]
+commands = flake8
+deps = flake8
diff --git a/mediawiki_dump_tools/README.rst b/mediawiki_dump_tools/README.rst
new file mode 100644 (file)
index 0000000..111728c
--- /dev/null
@@ -0,0 +1,9 @@
+To install this from git, first clone the repository::
+
+  git clone git://projects.mako.cc/mediawiki_dump_tools
+
+Then, from within the repository working directory, initialize and set up
+the submodule::
+
+  git submodule init
+  git submodule update
diff --git a/mediawiki_dump_tools/mw b/mediawiki_dump_tools/mw
new file mode 120000 (symlink)
index 0000000..75b92ba
--- /dev/null
@@ -0,0 +1 @@
+Mediawiki-Utilities/mw
\ No newline at end of file
diff --git a/mediawiki_dump_tools/wikiq b/mediawiki_dump_tools/wikiq
new file mode 100755 (executable)
index 0000000..d0466eb
--- /dev/null
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+
+# original wikiq headers are: title articleid revid date_time anon
+# editor editor_id minor text_size text_entropy text_md5 reversion
+# additions_size deletions_size
+
+import argparse
+import sys
+import os, os.path
+import re
+
+from subprocess import Popen, PIPE
+from collections import deque
+from hashlib import sha1
+
+from mw.xml_dump import Iterator
+from mw.lib import persistence
+from mw.lib import reverts
+from urllib.parse import quote
+TO_ENCODE = ('title', 'editor')
+PERSISTENCE_RADIUS=7
+
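+# Summarize persistence for one revision's added tokens: returns a tuple of
+# (total number of later revisions the added tokens survived, number of
+# tokens added).  len(x.revisions) - 1 excludes the revision that
+# introduced the token.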
+def calculate_persistence(tokens_added):
+    return(sum([(len(x.revisions)-1) for x in tokens_added]),
+           len(tokens_added))
+
+class WikiqIterator():
+    def __init__(self, fh, collapse_user=False):
+        self.fh = fh
+        self.collapse_user = collapse_user
+        self.mwiterator = Iterator.from_file(self.fh)
+        self.__pages = self.load_pages()
+
+    def load_pages(self):
+        for page in self.mwiterator:
+            yield WikiqPage(page, collapse_user=self.collapse_user)
+
+    def __iter__(self):
+        return self.__pages
+
+    def __next__(self):
+        return next(self.__pages)
+
+class WikiqPage():
+    __slots__ = ('id', 'title', 'namespace', 'redirect',
+                 'restrictions', 'mwpage', '__revisions',
+                 'collapse_user')
+    
+    def __init__(self, page, collapse_user=False):
+        self.id = page.id
+        self.title = page.title
+        self.namespace = page.namespace
+        self.redirect = page.redirect
+        self.restrictions = page.restrictions
+        
+        self.collapse_user = collapse_user
+        self.mwpage = page
+        self.__revisions = self.rev_list()
+
+    def rev_list(self):
+        # Outline for how we want to handle collapse_user=True
+        # iteration   rev.user   prev_rev.user   add prev_rev?
+        #         0          A            None           Never
+        #         1          A               A           False
+        #         2          B               A            True
+        #         3          A               B            True
+        #         4          A               A           False
+        # Post-loop                          A          Always
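+        # Worked example (illustrative): for the user sequence A,A,B,A this
+        # yields the second A-edit (collapsed_revs=2), then the B-edit
+        # (collapsed_revs=1), and, after the loop, the final A-edit
+        # (collapsed_revs=1).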
+        for i, rev in enumerate(self.mwpage):
+            # never yield the first time
+            if i == 0:
+                if self.collapse_user: 
+                    collapsed_revs = 1
+                    rev.collapsed_revs = collapsed_revs
+
+            else:
+                if self.collapse_user:
+                    # yield if this is the last edit in a seq by a user and reset
+                    if rev.contributor.user_text != prev_rev.contributor.user_text:
+                        yield prev_rev
+                        collapsed_revs = 1
+                        rev.collapsed_revs = collapsed_revs
+                    # otherwise, add one to the counter
+                    else:
+                        collapsed_revs += 1
+                        rev.collapsed_revs = collapsed_revs
+                # if collapse_user is false, we always yield
+                else:
+                    yield prev_rev
+
+            prev_rev = rev
+        # also yield the final time
+        yield prev_rev
+
+    def __iter__(self):
+        return self.__revisions
+
+    def __next__(self):
+        return next(self.__revisions)
+
+class WikiqParser():
+
+
+    def __init__(self, input_file, output_file, collapse_user=False, persist=False, urlencode=False):
+        
+        self.input_file = input_file
+        self.output_file = output_file
+        self.collapse_user = collapse_user
+        self.persist = persist
+        self.printed_header = False
+        self.namespaces = []
+        self.urlencode = urlencode
+        
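+    # Resolve a page title to a namespace id by its "Name:" prefix,
+    # e.g. "Talk:Foo" -> the Talk namespace id; unprefixed titles fall
+    # back to the default (unnamed) namespace.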
+    def __get_namespace_from_title(self, title):
+        default_ns = None
+
+        for ns in self.namespaces:
+            # the default namespace has no name, so it carries no title prefix
+            if ns is None:
+                default_ns = self.namespaces[ns]
+                continue
+
+            if title.startswith(ns + ":"):
+                return self.namespaces[ns]
+
+        # if we've made it this far with no matches, we return the default namespace
+        return default_ns
+
+    def process(self):
+
+        # create a regex that creates the output filename
+        # output_filename = re.sub(r'^.*/(enwiki\-\d+)\-.*p(\d+)p.*$',
+        #                         r'output/wikiq-\1-\2.tsv',
+        #                         input_filename)
+
+        # Construct dump file iterator
+        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
+
+        # extract the list of namespaces
+        self.namespaces = {ns.name : ns.id for ns in dump.mwiterator.namespaces}
+
+        page_count = 0
+        rev_count = 0
+        # Iterate through pages
+        for page in dump:
+            if self.persist:
+                state = persistence.State()
+                window = deque(maxlen=PERSISTENCE_RADIUS)
+
+            rev_detector = reverts.Detector()
+
+            # Iterate through a page's revisions
+            for rev in page:
+
+                rev_data = {'revid' : rev.id,
+                            'date_time' : rev.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
+                            'articleid' : page.id,
+                            'editor_id' : "" if rev.contributor.id is None else rev.contributor.id,
+                            'title' : '"' + page.title + '"',
+                            'namespace' : page.namespace if page.namespace else self.__get_namespace_from_title(page.title),
+                            'deleted' : "TRUE" if rev.text.deleted else "FALSE" } 
+
+                # if revisions are deleted, /many/ things will be missing
+                if rev.text.deleted:
+                    rev_data['text_chars'] = ""
+                    rev_data['sha1'] = ""
+                    rev_data['revert'] = ""
+                    rev_data['reverteds'] = ""
+
+                else:
+                    # if text exists, we'll check for a sha1 and generate one otherwise
+                    if rev.sha1:
+                        text_sha1 = rev.sha1
+                    else:
+                        text_sha1 = sha1(bytes(rev.text, "utf8")).hexdigest()
+                    
+                    rev_data['sha1'] = text_sha1
+
+                    # TODO rev.bytes doesn't work.. looks like a bug
+                    rev_data['text_chars'] = len(rev.text)
+               
+                    # generate revert data
+                    revert = rev_detector.process(text_sha1, rev.id)
+                    if revert:
+                        rev_data['revert'] = "TRUE"
+                        rev_data['reverteds'] = '"' + ",".join([str(x) for x in revert.reverteds]) + '"'
+                    else:
+                        rev_data['revert'] = "FALSE"
+                        rev_data['reverteds'] = ""
+
+                # if the fact that the edit was minor can be hidden, this might be an issue
+                rev_data['minor'] = "TRUE" if rev.minor else "FALSE"
+
+                if rev.contributor.user_text:
+                    # wrap user-defined editors in quotes for fread
+                    rev_data['editor'] = '"' + rev.contributor.user_text + '"'
+                    rev_data['anon'] = "TRUE" if rev.contributor.id is None else "FALSE"
+                    
+                else:
+                    rev_data['anon'] = ""
+                    rev_data['editor'] = ""
+
+                #if re.match(r'^#redirect \[\[.*\]\]', rev.text, re.I):
+                #    redirect = True
+                #else:
+                #    redirect = False
+                
+                #TODO missing: additions_size deletions_size
+                
+                # if collapse_user is on, record the collapsed revision count
+                if self.collapse_user:
+                    rev_data['collapsed_revs'] = rev.collapsed_revs
+
+                if self.persist:
+                    if rev.text.deleted:
+                        # deleted revisions have no visible text, so the
+                        # persistence measures are undefined for them
+                        for k in ["token_revs", "tokens_added", "tokens_removed", "tokens_window"]:
+                            rev_data[k] = None
+                        self.print_rev_data(rev_data)
+                    else:
+                        _, tokens_added, tokens_removed = state.process(rev.text, rev.id, text_sha1)
+                        window.append((rev.id, rev_data, tokens_added, tokens_removed))
+                        
+                        if len(window) == PERSISTENCE_RADIUS:
+                            old_rev_id, old_rev_data, old_tokens_added, old_tokens_removed = window[0]
+                            
+                            num_token_revs, num_tokens = calculate_persistence(old_tokens_added)
+
+                            old_rev_data["token_revs"] = num_token_revs
+                            old_rev_data["tokens_added"] = num_tokens
+                            old_rev_data["tokens_removed"] = len(old_tokens_removed)
+                            old_rev_data["tokens_window"] = PERSISTENCE_RADIUS-1
+
+                            self.print_rev_data(old_rev_data)
+
+                else:
+                    self.print_rev_data(rev_data)
+
+                rev_count += 1
+
+            if self.persist:
+                # print out metadata for the last RADIUS revisions
+                for i, item in enumerate(window):
+                    # if the window was full, we've already printed item 0
+                    if len(window) == PERSISTENCE_RADIUS and i == 0:
+                        continue
+
+                    rev_id, rev_data, tokens_added, tokens_removed = item
+                    num_token_revs, num_tokens = calculate_persistence(tokens_added)
+
+                    rev_data["token_revs"] = num_token_revs
+                    rev_data["tokens_added"] = num_tokens
+                    rev_data["tokens_removed"] = len(tokens_removed)
+                    rev_data["tokens_window"] = len(window)-(i+1)
+                    
+                    self.print_rev_data(rev_data)
+
+            page_count += 1
+
+        print("Done: %s revisions and %s pages." % (rev_count, page_count),
+              file=sys.stderr)
+
+    def print_rev_data(self, rev_data):
+        if self.urlencode:
+            for field in TO_ENCODE:
+                rev_data[field] = quote(str(rev_data[field]))
+
+        # if it's the first time through, print the header
+        if not self.printed_header:
+            print("\t".join([str(k) for k in sorted(rev_data.keys())]), file=self.output_file)
+            self.printed_header = True
+        
+        print("\t".join([str(v) for k, v in sorted(rev_data.items())]), file=self.output_file)
+
+
+def open_input_file(input_filename):
+    if re.match(r'.*\.7z$', input_filename):
+        cmd = ["7za", "x", "-so", input_filename, '*.xml']
+    elif re.match(r'.*\.gz$', input_filename):
+        cmd = ["zcat", input_filename]
+    elif re.match(r'.*\.bz2$', input_filename):
+        cmd = ["bzcat", input_filename]
+
+    try:
+        input_file = Popen(cmd, stdout=PIPE).stdout
+    except NameError:
+        input_file = open(input_filename, 'r')
+
+    return input_file
+
+def open_output_file(input_filename):
+    # derive the output filename from the input filename,
+    # e.g. "dump.xml.bz2" becomes "dump.tsv"
+    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
+    output_filename = re.sub(r'\.xml$', '', output_filename)
+    output_filename = output_filename + ".tsv"
+    output_file = open(output_filename, "w")
+
+    return output_file
+
+parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')
+
+# arguments for the input direction
+parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str, 
+                    help="Filename of the compressed or uncompressed XML database dump. If absent, we'll look for content on stdin and output on stdout.")
+
+parser.add_argument('-o', '--output-dir', metavar='DIR', dest='output_dir', type=str, nargs=1,
+                    help="Directory for output files.")
+
+parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
+                    help="Write output to standard out (do not create dump file)")
+
+parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
+                    help="Operate only on the final revision in each sequence of consecutive edits by the same user. This can be useful for addressing issues with text persistence measures.")
+
+parser.add_argument('-p', '--persistence', dest="persist", action="store_true",
+                    help="Compute and report measures of content persistence: (1) persistent token revisions, (2) tokens added, and (3) the number of revisions used in computing the first measure.")
+
+parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
+                    help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")
+
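+# Example invocation (illustrative): parse a compressed dump into
+# ./output/dump.tsv, collapsing runs of consecutive edits by the same user:
+#
+#   ./wikiq dump.xml.bz2 -o output --collapse-user
+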
+args = parser.parse_args()
+
+if len(args.dumpfiles) > 0:
+    for filename in args.dumpfiles:
+        input_file = open_input_file(filename)
+
+        print("Processing file: %s" % filename, file=sys.stderr)
+
+        # open file for output; --stdout takes precedence over --output-dir
+        if args.stdout:
+            output_file = sys.stdout
+        else:
+            if args.output_dir:
+                output_dir = args.output_dir[0]
+            else:
+                output_dir = "."
+
+            filename = os.path.join(output_dir, os.path.basename(filename))
+            output_file = open_output_file(filename)
+
+        wikiq = WikiqParser(input_file, output_file,
+                            collapse_user=args.collapse_user,
+                            persist=args.persist,
+                            urlencode=args.urlencode)
+
+        wikiq.process()
+
+        # close things; leave sys.stdout open for the next file
+        input_file.close()
+        if not args.stdout:
+            output_file.close()
+else:
+    wikiq = WikiqParser(sys.stdin, sys.stdout,
+                        collapse_user=args.collapse_user,
+                        persist=args.persist,
+                        urlencode=args.urlencode)
+    wikiq.process()
+
+# stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
+# stop_words = stop_words.split(",")
diff --git a/paper_source/ACM-Reference-Format.bst b/paper_source/ACM-Reference-Format.bst
new file mode 100644 (file)
index 0000000..f79017e
--- /dev/null
@@ -0,0 +1,2893 @@
+%%% -*-BibTeX-*-
+%%% ====================================================================
+%%%  @BibTeX-style-file{
+%%%     author          = "Nelson H. F. Beebe, Boris Veytsman and Gerald Murray",
+%%%     version         = "2.1",
+%%%     date            = "14 June 2017",
+%%%     filename        = "ACM-Reference-Format.bst",
+%%%     email           = "borisv@lk.net, boris@varphi.com",
+%%%     codetable       = "ISO/ASCII",
+%%%     keywords        = "ACM Transactions bibliography style; BibTeX",
+%%%     license         = "public domain",
+%%%     supported       = "yes",
+%%%     abstract        = "",
+%%%  }
+%%% ====================================================================
+
+%%% Revision history:  see source in git
+
+ENTRY
+  { address
+    advisor
+    archiveprefix
+    author
+    booktitle
+    chapter
+    city
+    date
+    edition
+    editor
+    eprint
+    eprinttype
+    eprintclass
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    primaryclass
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+        % New keys recognized
+        issue         % UTAH: used in, e.g., ACM SIGSAM Bulletin and ACM Communications in Computer Algebra
+        articleno
+        eid
+        day           % UTAH: needed for newspapers, weeklies, bi-weeklies
+        doi           % UTAH
+        url           % UTAH
+        bookpages     % UTAH
+        numpages
+        lastaccessed  % UTAH: used only for @Misc{...}
+        coden         % UTAH
+        isbn          % UTAH
+        isbn-13       % UTAH
+        issn          % UTAH
+        lccn          % UTAH
+  }
+  {}
+  { label.year extra.label sort.year sort.label basic.label.year}
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+INTEGERS { show-isbn-10-and-13 }  % initialized below in begin.bib
+
+INTEGERS { nameptr namesleft numnames }
+
+INTEGERS { multiresult }
+
+INTEGERS { len }
+
+INTEGERS { last.extra.num }
+
+STRINGS { s t t.org u }
+
+STRINGS { last.label next.extra }
+
+STRINGS { p1 p2 p3 page.count }
+
+
+FUNCTION { not }
+{
+    { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION { and }
+{
+    'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION { or }
+{
+   { pop$ #1 }
+    'skip$
+  if$
+}
+
+
+FUNCTION { dump.stack.1 }
+{
+    duplicate$ "STACK[top] = [" swap$ * "]" * warning$
+}
+
+FUNCTION { dump.stack.2 }
+{
+    duplicate$ "STACK[top  ] = [" swap$ * "]" * warning$
+    swap$
+    duplicate$ "STACK[top-1] = [" swap$ * "]" * warning$
+    swap$
+}
+
+FUNCTION { empty.or.unknown }
+{
+  %% Examine the top stack entry, and push 1 if it is empty, or
+  %% consists only of whitespace, or is a string beginning with two
+  %% queries (??), and otherwise, push 0.
+  %%
+  %% This function provides a replacement for empty$, with the
+  %% convenient feature that unknown values marked by two leading
+  %% queries are treated the same as missing values, and thus, do not
+  %% appear in the output .bbl file, and yet, their presence in .bib
+  %% file(s) serves to mark values which are temporarily missing, but
+  %% are expected to be filled in eventually once more data is
+  %% obtained.  The TeX User Group and BibNet bibliography archives
+  %% make extensive use of this practice.
+  %%
+  %% An empty string cannot serve the same purpose, because just as in
+  %% statistics data processing, an unknown value is not the same as an
+  %% empty value.
+  %%
+  %% At entry: stack = ... top:[string]
+  %% At exit:  stack = ... top:[0 or 1]
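+  %%
+  %% Example: "", "??", and "??1994" all push 1 (treated as empty/unknown),
+  %% while "1994" pushes 0.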
+
+  duplicate$ empty$
+    { pop$ #1 }
+    { #1 #2 substring$ "??" = }
+  if$
+}
+
+FUNCTION { writeln }
+{
+  %% In BibTeX style files, the sequences
+  %%
+  %%     ... "one" "two" output
+  %%     ... "one" "two" output.xxx
+  %%
+  %% ship "one" to the output file, possibly following by punctuation,
+  %% leaving the stack with
+  %%
+  %%     ... "two"
+  %%
+  %% There is thus a one-string lag in output processing that must be
+  %% carefully handled to avoid duplicating a string in the output
+  %% file.  Unless otherwise noted, all output.xxx functions leave
+  %% just one new string on the stack, and that model should be borne
+  %% in mind when reading or writing function code.
+  %%
+  %% BibTeX's asynchronous buffering of output from strings from the
+  %% stack is confusing because newline$ bypasses the buffer.  It
+  %% would have been so much easier for newline to be a character
+  %% rather than a state of the output-in-progress.
+  %%
+  %% The documentation in btxhak.dvi is WRONG:  it says
+  %%
+  %%    newline$ Writes onto the bbl file what's accumulated in the
+  %%             output buffer. It writes a blank line if and only
+  %%             if the output buffer is empty. Since write$ does
+  %%             reasonable line breaking, you should use this
+  %%             function only when you want a blank line or an
+  %%             explicit line break.
+  %%
+  %%    write$   Pops the top (string) literal and writes it on the
+  %%             output buffer (which will result in stuff being
+  %%             written onto the bbl file when the buffer fills
+  %%             up).
+  %%
+  %% Examination of the BibTeX source code shows that write$ does
+  %% indeed behave as claimed, but newline$ sends a newline character
+  %% directly to the output file, leaving the stack unchanged.  The
+  %% first line "Writes onto ... buffer." is therefore wrong.
+  %%
+  %% The original BibTeX style files almost always use "write$ newline$"
+  %% in that order, so it makes sense to hide that pair in a private
+  %% function like this one, named after a statement in Pascal,
+  %% the programming language embedded in the BibTeX Web program.
+
+  write$                % output top-of-stack string
+  newline$              % immediate write of newline (not via stack)
+}
+
+FUNCTION { init.state.consts }
+{
+  #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+FUNCTION { output.nonnull }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<comma><space>
+  's :=
+  output.state mid.sentence =
+    {
+      ", " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln
+          "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.dot.space }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<dot><space>
+  's :=
+  output.state mid.sentence =           % { "<DEBUG output.nonnull.dot.space>. " * write$ }
+    {
+      ". " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.remove }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<space>
+  's :=
+  output.state mid.sentence =
+    {
+      " " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.removenospace }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S
+  's :=
+  output.state mid.sentence =
+    {
+      "" * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output }
+{ % discard top token if empty, else like output.nonnull
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION { output.dot.space }
+{ % discard top token if empty, else like output.nonnull.dot.space
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull.dot.space
+  if$
+}
+
+FUNCTION { output.removenospace }
+{ % discard top token if empty, else like output.nonnull.removenospace
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull.removenospace
+  if$
+}
+
+FUNCTION { output.check }
+{ % like output, but warn if key name on top-of-stack is not set
+  't :=
+  duplicate$ empty.or.unknown
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION { bibinfo.output.check }
+{ % like output.check, adding bibinfo field
+  't :=
+  duplicate$ empty.or.unknown
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    { "\bibinfo{" t "}{" * * swap$ * "}" *
+      output.nonnull }
+  if$
+}
+
+FUNCTION { output.check.dot.space }
+{ % like output.dot.space, but warn if key name on top-of-stack is not set
+  't :=
+  duplicate$ empty.or.unknown
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull.dot.space
+  if$
+}
+
+FUNCTION { fin.block }
+{ % functionally, but not logically, identical to fin.entry
+   add.period$
+   writeln
+}
+
+FUNCTION { fin.entry }
+{
+   add.period$
+   writeln
+}
+
+FUNCTION { new.sentence }
+{ % update sentence state, with neither output nor stack change
+  output.state after.block =
+    'skip$
+    {
+      output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION { fin.sentence }
+{
+   add.period$
+   write$
+   new.sentence
+   ""
+}
+
+FUNCTION { new.block }
+{
+  output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION { output.coden }       % UTAH
+{ % output non-empty CODEN as one-line sentence (stack untouched)
+  coden empty.or.unknown
+    { }
+    { "\showCODEN{" coden * "}" * writeln }
+  if$
+}
+
+FUNCTION { format.articleno }
+{
+  articleno empty.or.unknown not eid empty.or.unknown not and
+     { "Both articleno and eid are defined for " cite$ * warning$ }
+     'skip$
+  if$
+  articleno empty.or.unknown eid empty.or.unknown and
+     { "" }
+     {
+        numpages empty.or.unknown
+          { "articleno or eid field, but no numpages field, in "
+            cite$ * warning$ }
+          { }
+        if$
+        eid empty.or.unknown
+          { "Article \bibinfo{articleno}{" articleno * "}" * }
+          { "Article \bibinfo{articleno}{" eid * "}" * }
+        if$
+     }
+  if$
+}
+
+FUNCTION { format.year }
+{ % push year string or "[n. d.]" onto output stack
+  %% Because year is a mandatory field, we always force SOMETHING
+  %% to be output
+  "\bibinfo{year}{"
+  year empty.or.unknown
+    { "[n. d.]" }
+    { year }
+  if$
+  *  "}" *
+}
+
+FUNCTION { format.day.month }
+{ % push "day month " or "month " or "" onto output stack
+  day empty.or.unknown
+    {
+      month empty.or.unknown
+        { "" }
+        { "\bibinfo{date}{" month * "} " *}
+      if$
+    }
+    {
+      month empty.or.unknown
+        { "" }
+        { "\bibinfo{date}{" day * " " * month * "} " *}
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.day.month.year }     % UTAH
+{ % if month is empty, push "" else push "(MON.)" or "(DD MON.)"
+  % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+  % acm-*.bst addition: prefix parenthesized date string with
+  % ", Article nnn "
+  articleno empty.or.unknown eid empty.or.unknown and
+    { "" }
+    { output.state after.block =
+       {", " format.articleno * }
+       { format.articleno  }
+      if$
+    }
+  if$
+  " (" * format.day.month * format.year * ")" *
+}
+
+FUNCTION { output.day.month.year }     % UTAH
+{ % if month is empty value, do nothing; else output stack top and
+  % leave with new top string "(MON.)" or "(DD MON.)"
+  % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+  format.day.month.year
+  output.nonnull.remove
+}
+
+FUNCTION { strip.doi } % UTAH
+{ % Strip any Web address prefix to recover the bare DOI, leaving the
+  % result on the output stack, as recommended by CrossRef DOI
+  % documentation.
+  % For example, reduce "http://doi.acm.org/10.1145/1534530.1534545" to
+  % "10.1145/1534530.1534545".  That is later typeset and displayed as
+  % doi:10.1145/1534530.1534545 as the LAST item in the reference list
+  % entry.  Publisher Web sites wrap this with a suitable link to a real
+  % URL to resolve the DOI, and the master https://doi.org/ address is
+  % preferred, since publisher-specific URLs can disappear in response
+  % to economic events.  All journals are encouraged by the DOI
+  % authorities to use that typeset format and link procedures for
+  % uniformity across all publications that include DOIs in reference
+  % lists.
+  % The numeric prefix is guaranteed to start with "10.", so we use
+  % that as a test.
+  % 2017-02-04 Added stripping of https:// (Boris)
+  doi #1 #3 substring$ "10." =
+    { doi }
+    {
+      doi 't :=  % get modifiable copy of DOI
+
+      % Change https:// to http:// to strip both prefixes (BV)
+
+      t #1 #8 substring$ "https://" =
+        { "http://"  t #9 t text.length$ #8 - substring$ * 't := }
+        { }
+      if$
+
+      t #1 #7 substring$ "http://" =
+        {
+            t #8 t text.length$ #7 - substring$ 't :=
+
+            "INTERNAL STYLE-FILE ERROR" 's :=
+
+            % search for next "/" and assign its suffix to s
+
+            { t text.length$ }
+            {
+              t #1 #1 substring$ "/" =
+                {
+                  % save rest of string as true DOI (should be 10.xxxx/yyyy)
+                  t #2 t text.length$ #1 - substring$ 's :=
+                  "" 't :=    % empty string t terminates the loop
+                }
+                {
+                  % discard first character and continue loop: t <= substring(t,2,last)
+                  t #2 t text.length$ #1 - substring$ 't :=
+                }
+              if$
+            }
+            while$
+
+            % check for valid DOI (should be 10.xxxx/yyyy)
+            s #1 #3 substring$ "10." =
+              { }
+              { "unrecognized DOI substring " s * " in DOI value [" * doi * "]" * warning$ }
+            if$
+
+            s   % push the stripped DOI on the output stack
+
+        }
+        {
+          "unrecognized DOI value [" doi * "]" * warning$
+          doi   % push the unrecognized original DOI on the output stack
+        }
+      if$
+    }
+  if$
+}
+
+%
+% Change by BV: added standard prefix to URL
+%
+FUNCTION { output.doi } % UTAH
+{ % output non-empty DOI as one-line sentence (stack untouched)
+  doi empty.or.unknown
+    { }
+    {
+      %% Use \urldef here for the same reason it is used in output.url,
+      %% see output.url for further discussion.
+      "\urldef\tempurl%" writeln
+      "\url{https://doi.org/" strip.doi * "}" * writeln
+      "\showDOI{\tempurl}" writeln
+    }
+  if$
+}
+
+FUNCTION { output.isbn }                % UTAH
+{ % output non-empty ISBN-10 and/or ISBN-13 as one-line sentences (stack untouched)
+  show-isbn-10-and-13
+    {
+      %% show both 10- and 13-digit ISBNs
+      isbn empty.or.unknown
+        { }
+        {
+          "\showISBNx{" isbn * "}" * writeln
+        }
+      if$
+      isbn-13 empty.or.unknown
+        { }
+        {
+          "\showISBNxiii{" isbn-13 * "}" * writeln
+        }
+      if$
+    }
+    {
+      %% show 10-digit ISBNs only if 13-digit ISBNs not available
+      isbn-13 empty.or.unknown
+        {
+          isbn empty.or.unknown
+            { }
+            {
+              "\showISBNx{" isbn * "}" * writeln
+            }
+          if$
+        }
+        {
+          "\showISBNxiii{" isbn-13 * "}" * writeln
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION { output.issn } % UTAH
+{ % output non-empty ISSN as one-line sentence (stack untouched)
+  issn empty.or.unknown
+    { }
+    { "\showISSN{" issn * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.issue }
+{ % output non-empty issue number as a one-line sentence (stack untouched)
+  issue empty.or.unknown
+    { }
+    { "Issue " issue * "." * writeln }
+  if$
+}
+
+FUNCTION { output.lccn } % UTAH
+{ % return with stack untouched
+  lccn empty.or.unknown
+    { }
+    { "\showLCCN{" lccn * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.note } % UTAH
+{ % return with stack empty
+  note empty.or.unknown
+    { }
+    { "\shownote{" note add.period$ * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.note.check } % UTAH
+{ % return with stack empty
+  note empty.or.unknown
+    { "empty note in " cite$ * warning$ }
+    { "\shownote{" note add.period$ * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.eprint } %
+{ % return with stack empty
+  eprint empty.or.unknown
+    { }
+    { "\showeprint"
+         archiveprefix empty.or.unknown
+           { eprinttype empty.or.unknown
+               { }
+               { "[" eprinttype "]" * * * }
+             if$
+           }
+           { "[" archiveprefix "l" change.case$ "]" * * * }
+         if$
+         "{" *
+         primaryclass empty.or.unknown
+           { eprintclass empty.or.unknown
+             { }
+             { eprintclass "/" * * }
+             if$
+           }
+           { primaryclass "/" * * }
+         if$
+         eprint "}" * *
+         writeln
+    }
+  if$
+}
+
+
+%
+% Changes by BV 2011/04/15.  Do not output
+% url if doi is defined
+%
+FUNCTION { output.url } % UTAH
+{ % return with stack untouched
+  % output URL and associated lastaccessed fields
+  doi empty.or.unknown
+  {
+    url empty.or.unknown
+      { }
+      {
+          %% Use \urldef, outside \showURL, so that %nn, #, etc in URLs work
+          %% correctly.  Put the actual URL on its own line to reduce the
+          %% likelihood of BibTeX's nasty line wrapping after column 79.
+          %% \url{} can undo this, but if that doesn't work for some reason
+          %% the .bbl file would have to be repaired manually.
+          "\urldef\tempurl%" writeln
+          "\url{" url * "}" * writeln
+
+          "\showURL{%" writeln
+          lastaccessed empty.or.unknown
+            { "" }
+            { "Retrieved " lastaccessed * " from " * }
+          if$
+          "\tempurl}" * writeln
+      }
+      if$
+  }
+  { }
+  if$
+}
+
+FUNCTION { output.year.check }
+{ % warn if year empty, output top string and leave " YEAR<label>" on stack in mid-sentence
+  year empty.or.unknown
+     { "empty year in " cite$ * warning$
+       write$
+       " \bibinfo{year}{[n. d.]}"
+       "\natexlab{" extra.label * "}" * *
+       mid.sentence 'output.state :=
+     }
+     { write$
+       " \bibinfo{year}{" year * "}"  *
+       "\natexlab{" extra.label * "}" * *
+       mid.sentence 'output.state :=
+     }
+  if$
+}
+
+
+FUNCTION { le }
+{
+  %% test whether first number is less than or equal to second number
+  %% stack in:  n1 n2
+  %% stack out: if n1 <= n2 then 1 else 0
+
+  %% "DEBUG: le " cite$ * warning$
+  > { #0 } { #1 } if$
+}
+
+FUNCTION { ge }
+{
+  %% test whether first number is greater than or equal to second number
+  %% stack in:  n1 n2
+  %% stack out: if n1 >= n2 then 1 else 0
+
+  %% "DEBUG: ge " cite$ * warning$
+  < { #0 } { #1 } if$
+}
+
+FUNCTION { is.leading.digit }
+{
+  %% test whether first character of string is a digit
+  %% stack in:  string
+  %% stack out: if first-char-is-digit then 1 else 0
+
+  #1 #1 substring$                      % replace string by string[1:1]
+  duplicate$                            % string[1:1] string[1:1]
+  chr.to.int$
+  "0" chr.to.int$ swap$ le              % "0" <= string[1:1] --> 0-or-1
+  swap$                                 % 0-or-1 string[1:1]
+  chr.to.int$
+  "9" chr.to.int$ le                    % string[1:1} <= "9" --> 0-or-1
+  and
+}
+
+FUNCTION { skip.digits }
+{
+  %% skip over leading digits in string
+  %% stack in:  string
+  %% stack out: rest-of-string leading-digits
+
+  %% "DEBUG: enter skip.digits " cite$ * warning$
+
+  %% dump.stack.1
+
+  duplicate$
+  't :=
+  't.org :=
+  "" 'u :=
+
+  { t text.length$ }
+  {
+    %% "=================DEBUG: skip.digits   t = [" t * "]" * warning$
+    t is.leading.digit
+      { t #2 t text.length$ #1 - substring$ }
+      {
+        t 'u :=
+        ""
+      }
+    if$
+    't :=
+  }
+  while$
+
+  u                                                             % rest of string
+  t.org #1 t.org text.length$ u text.length$ - substring$       % leading digits
+
+  %% "DEBUG: t.org = [" t.org * "]" * warning$
+  %% "DEBUG: u     = [" u * "]" * warning$
+
+  %% dump.stack.2
+
+  %% "DEBUG: leave skip.digits " cite$ * warning$
+}
+
+FUNCTION { skip.nondigits }
+{
+  %% skip over leading nondigits in string
+  %% stack in:  string
+  %% stack out: rest-of-string
+
+  %% "DEBUG: enter skip.nondigits " cite$ * warning$
+
+  't :=
+  "" 'u :=
+
+  { t text.length$ }
+  {
+    %% "=================DEBUG: skip.nondigits   t = [" t * "]" * warning$
+    t is.leading.digit
+      {
+        t 'u :=
+        ""
+      }
+      { t #2 t text.length$ #1 - substring$ }
+    if$
+    't :=
+  }
+  while$
+
+  u                     % rest of string
+
+  %% dump.stack.1
+  %% "DEBUG: leave skip.nondigits " cite$ * warning$
+}
+
+FUNCTION { parse.next.number }
+{
+  %% stack in:  string
+  %% stack out: rest-of-string next-numeric-part-of-string
+  %% Example:
+  %% stack in:  "123:1--123:59"
+  %% stack out: ":1--123:59" "123"
+
+  's :=
+  s skip.nondigits 's :=
+  s skip.digits
+}
+
+FUNCTION { reduce.pages.to.page.count }
+{
+  %% Stack in:  arbitrary-and-unused
+  %% Stack out: unchanged
+  %%
+  %% For the new-style pagination with article number and numpages or
+  %% pages, we expect to have BibTeX entries containing something like
+  %%     articleno = "17",
+  %%     pages     = "1--23",
+  %% with output "Article 17, 23 pages",
+  %% or
+  %%     articleno = "17",
+  %%     numpages  = "23",
+  %% with output "Article 17, 23 pages",
+  %% or
+  %%     articleno = "17",
+  %%     pages     = "17:1--17:23",
+  %% with output "Article 17, 23 pages",
+  %%
+  %% If articleno is missing or empty, then we should output "1--23",
+  %% "23" (with a warning of a missing articleno), or "17:1--17:23",
+  %% respectively.
+
+  %% "DEBUG: enter reduce.pages.to.page.count " cite$ * warning$
+
+  %% "DEBUG: pages = [" pages * "]" * warning$
+
+  pages
+  parse.next.number 'p1 :=
+  parse.next.number 'p2 :=
+  parse.next.number 'p3 :=
+  parse.next.number 'page.count :=
+
+  duplicate$
+  empty.or.unknown
+    {  }
+    {
+      duplicate$ "unexpected trailing garbage [" swap$ *
+      "] after n:p1--n:p2 in pages = [" *
+      pages *
+      "] in " *
+      cite$ *
+      warning$
+    }
+  if$
+
+  pop$
+
+  %% "DEBUG: reduce.pages.to.page.count: "
+  %% " p1 = " p1 * *
+  %% " p2 = " p2 * *
+  %% " p3 = " p3 * *
+  %% " p4 = " page.count * *
+  %% " in " cite$ * * warning$
+
+  p1 p3 =   p2 "1" =   and   numpages empty.or.unknown   and
+    { "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$ }
+    {
+      numpages empty.or.unknown
+        { pages }
+        { numpages }
+      if$
+      'page.count :=
+    }
+  if$
+
+  p1 "1" =   p3 empty.or.unknown   and   numpages empty.or.unknown   and
+    {
+      p2 'page.count :=
+      "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$
+    }
+    {
+      numpages empty.or.unknown
+        { pages }
+        { numpages }
+      if$
+      'page.count :=
+    }
+  if$
+
+  %% "DEBUG: leave reduce.pages.to.page.count " cite$ * warning$
+}
+
+FUNCTION { new.block.checkb }
+{ % issue a new.block only if at least one of top two stack strings is not empty
+  empty.or.unknown
+  swap$ empty.or.unknown
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION { field.or.null }
+{ % convert empty value to null string, else return value
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+
+
+FUNCTION { emphasize }
+{ % emphasize a non-empty top string on the stack
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+
+FUNCTION { comma }
+{ % convert empty string to null string, or brace string and add trailing comma
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    { "{" swap$ * "}," * }
+  if$
+}
+
+FUNCTION { format.names }
+{
+  % Format bibliographical entries with every name given first name
+  % first ("{ff }{vv }{ll}{, jj}"), wrapped in \bibinfo{person}{...}.
+  % All names are formatted in this routine.
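+  %
+  % Illustrative example (hypothetical names): "Anne Author and Bob Builder"
+  % is rendered as
+  % "\bibinfo{person}{Anne Author} {and} \bibinfo{person}{Bob Builder}".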
+
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.names$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 =
+        %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+        %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+        {"\bibinfo{person}{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+        {"\bibinfo{person}{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+      if$
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "\bibinfo{person}{others}" =
+                { " {et~al\mbox{.}}" * } % jrh: avoid spacing problems
+                { " {and} " * t * } % from Chicago Manual of Style
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+}
+
+FUNCTION { my.full.label }
+{
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.names$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+
+    { s nameptr "{vv~}{ll}" format.name$ 't :=  % get the next name
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al\mbox{.}" * } % jrh: avoid spacing problems
+                { " and " * t * } % from Chicago Manual of Style
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+
+}
+
+FUNCTION { format.names.fml }
+{
+  % Format names in "familiar" format, with first initial followed by
+  % last name. Like format.names, ALL names are formatted.
+  % jtb: The names are NOT put in small caps
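+  % Illustrative example (hypothetical name): "Anne Author" is rendered
+  % as "\bibinfo{person}{Anne~Author}".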
+
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.names$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+
+    {
+      "\bibinfo{person}{" s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ * "}" * 't :=
+
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "\bibinfo{person}{others}" =
+                { " {et~al\mbox{.}}" * }
+                { " {and} " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+}
+
+FUNCTION { format.authors }
+{
+  author empty.or.unknown
+    { "" }
+    {
+      "\bibfield{author}{"
+      author format.names add.period$ * "}" *} % jtb: add period if none before
+  if$
+}
+
+FUNCTION { format.key }
+{
+  empty.or.unknown
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION { format.no.key }
+{
+  empty.or.unknown
+    { "" }
+    { "" }
+  if$
+}
+
+FUNCTION { format.editors.fml }
+{
+  % Format editor names for use in the "in" types: inbook, incollection,
+  % inproceedings: first initial, then last names. When editors are the
+  % LABEL for an entry, then format.editor is used which lists editors
+  % by last name first.
+
+  editor empty.or.unknown
+    { "" }
+    {
+      "\bibfield{editor}{"
+      editor format.names.fml
+      *  "}" *
+      editor num.names$ #1 >
+        { " (Eds.)" * }
+        { " (Ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.editors }
+{ % format editor names for use in labels, last names first.
+  editor empty.or.unknown
+    { "" }
+    {
+      "\bibfield{editor}{"
+      editor format.names
+      *  "}" *
+      editor num.names$ #1 >
+        { " (Eds.)." * }
+        { " (Ed.)." * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.articletitle }
+{
+  title empty.or.unknown
+    { "" }
+    % Use this to preserve lettercase in titles:
+    { "\showarticletitle{" title * "}" * }
+    % Use this for downcase title style:
+    % { \showarticletitle{" title "t" change.case$ * "}" * }
+  if$
+}
+
+FUNCTION { format.title }
+{
+  title empty.or.unknown
+    { "" }
+    % Use this to preserve lettercase in titles:
+    { "\bibinfo{title}{" title * "}" * }
+    % Use this for downcase title style:
+    % { title "t" change.case$ }
+  if$
+}
+
+FUNCTION { n.dashify }
+{
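+  % Normalize page-range dashes: a single "-" between page numbers
+  % becomes "--"; existing "--" (or longer) runs are copied unchanged.
+  % Example: "117-118" --> "117--118"; "117--118" stays as-is.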
+  't :=
+  ""
+    { t empty.or.unknown not }
+    {
+      t #1 #1 substring$ "-" =
+        {
+          t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {
+              { t #1 #1 substring$ "-" = }
+              {
+                "-" *
+                t #2 global.max$ substring$ 't :=
+              }
+              while$
+            }
+          if$
+        }
+        {
+          t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION { format.a.title.with.edition }
+{
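+  % Emphasize the title on the stack and append the edition, if any,
+  % inside \bibinfo{booktitle}{...}.  Illustrative example (hypothetical
+  % values): a title "Foo" with edition = "Second" yields
+  % "\bibinfo{booktitle}{\emph{Foo} (\bibinfo{edition}{second} ed.)}".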
+  "\bibinfo{booktitle}{"
+  swap$ emphasize *
+  edition empty.or.unknown
+    'skip$
+    { " (\bibinfo{edition}{" * edition "l" change.case$ *
+      "} ed.)" * } % jtb: no parens for ed.
+  if$
+  "}" *
+}
+
+FUNCTION { format.btitle }
+{ title format.a.title.with.edition }
+
+FUNCTION { format.emphasize.booktitle }
+{ booktitle format.a.title.with.edition }
+
+FUNCTION { format.city }
+{
+  % jtb: if the preceding string (the title of the conference) is non-empty,
+  % jtb: append the location, otherwise leave empty (so as to trigger the
+  % jtb: error message in output.check)
+
+  duplicate$ empty.or.unknown
+    { }
+    {
+      city empty.or.unknown
+        {
+          date empty.or.unknown
+            { }
+            { " (" * date * ")" * }
+          if$
+        }
+        {
+          date empty.or.unknown
+            { " (" * city * ")" * }
+            { " (" * city * ", " * date * ")" * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION { tie.or.space.connect }
+{
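+  % Join the two top stack strings, inserting a tie (~) before the top
+  % string when it is shorter than three characters (to avoid a line
+  % break before a short word or number), and a space otherwise.
+  % Example: "Vol." "7" tie.or.space.connect --> "Vol.~7"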
+  duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION { either.or.check }
+{
+  empty.or.unknown
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION { format.bvolume }
+{
+  % jtb: If there is a series, this is added and the volume trails after it.
+  % jtb: Otherwise, "Vol" is Capitalized.
+
+  volume empty.or.unknown
+    { "" }
+    {
+      series empty.or.unknown
+        { "Vol.~\bibinfo{volume}{" volume "}" * *}
+        { "\bibinfo{series}{" series "}, " * *
+          "Vol.~\bibinfo{volume}{" volume "}" * * *}
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION { format.bvolume.noseries }
+{
+  volume empty.or.unknown
+    { "" }
+    { "Vol.~\bibinfo{volume}{" volume "}" * *
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION { format.series }
+{
+  series empty.or.unknown
+    {""}
+    {" \emph{(\bibinfo{series}{" * series "})}" *}
+  if$
+}
+
+FUNCTION { format.number.series }
+{
+  volume empty.or.unknown
+    {
+      number empty.or.unknown
+        {
+          volume empty.or.unknown
+          { "" }
+          {
+            series empty.or.unknown
+              { "" }
+              { " (\bibinfo{series}{" series * "})" * }
+            if$
+          }
+          if$
+        }                                       %    { series field.or.null }
+        {
+          output.state mid.sentence =
+            { "Number" }                        % gnp - changed to mixed case always
+            { "Number" }
+          if$
+          number tie.or.space.connect series empty.or.unknown
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in \bibinfo{series}{" * series * "}" * }
+          if$
+        }
+      if$
+    }
+    {
+      ""
+    }
+  if$
+}
+
+FUNCTION { multi.page.check }
+{
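+  % Return 1 if the string looks like a multi-page specification,
+  % i.e. contains "-", ",", or "+"; else return 0.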
+  't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty.or.unknown not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+    { #1 'multiresult := }
+    { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION { format.pages }
+{
+  pages empty.or.unknown
+    { "" }
+    { "\bibinfo{pages}{"
+      pages multi.page.check
+        { pages n.dashify } % gnp - removed () % jtb: removed pp.
+        { pages }
+      if$
+      * "}" *
+    }
+  if$
+}
+
+FUNCTION { format.pages.check.without.articleno }
+{ %% format pages field only if both articleno and eid are absent
+  %% Stack out: pages-specification
+  numpages missing$ pages missing$ and
+    { "page numbers missing in both pages and numpages fields in " cite$ * warning$ }
+    { }
+  if$
+
+  articleno empty.or.unknown eid empty.or.unknown and
+    {
+      pages missing$
+        { numpages }
+        { format.pages }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION { format.pages.check }
+{
+  pages empty.or.unknown
+    { "page numbers missing in " cite$ * warning$ "" }
+    { pages n.dashify }
+  if$
+}
+
+FUNCTION { format.bookpages }
+{
+  bookpages empty.or.unknown
+    { "" }
+    { bookpages "book pages" tie.or.space.connect }
+  if$
+}
+
+FUNCTION { format.named.pages }
+{
+  pages empty.or.unknown
+    { "" }
+    { format.pages "pages" tie.or.space.connect }
+  if$
+}
+
+%
+% Changed by Boris Veytsman, 2011-03-13
+% Now the word "pages" is printed even if
+% the field pages is not empty.
+%
+
+FUNCTION { format.page.count }
+{
+  page.count empty.or.unknown
+    { "" }
+    { "\bibinfo{numpages}{" page.count * "}~pages" * }
+  if$
+}
+
+FUNCTION { format.articleno.numpages }
+{
+  %% There are seven possible outputs, depending on which fields are set.
+  %%
+  %% These four are handled here:
+  %%
+  %%     articleno, numpages, pages     -> "Article articleno-value, numpages-value pages"
+  %%     articleno, numpages            -> "Article articleno-value, numpages-value pages"
+  %%     articleno, pages               -> "Article articleno-value, reduced-pages-value pages"
+  %%     articleno                      -> "Article articleno-value" and warn about missing numpages
+  %%
+  %% The remaining three have already been handled by
+  %% format.pages.check.without.articleno:
+  %%
+  %%     numpages, pages                -> "pages-value"
+  %%     numpages                       -> "numpages-value"
+  %%     pages                          -> "pages-value"
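+  %%
+  %% Illustrative example (hypothetical entry): articleno = "5" and
+  %% numpages = "12" produce "\bibinfo{numpages}{12}~pages" here; the
+  %% "Article 5" part is emitted later by format.day.month.year.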
+
+  articleno empty.or.unknown eid empty.or.unknown and
+    {
+      numpages empty.or.unknown
+        { }
+        { "numpages field, but no articleno or eid field, in "
+          cite$ * warning$ }
+      if$
+      ""
+    }
+    {
+      numpages empty.or.unknown
+        {
+          pages empty.or.unknown
+            {
+              "articleno or eid, but no pages or numpages field in "
+                 cite$ * warning$
+              "" 'page.count :=
+            }
+            { reduce.pages.to.page.count }
+          if$
+        }
+        { numpages 'page.count := }
+      if$
+
+      %% The Article number is now handled in format.day.month.year because
+      %% ACM prefers the style "Digital Libraries 12, 3, Article 5 (July 2008)"
+      %% over "Digital Libraries 12, 3 (July 2008), Article 5"
+      %% format.articleno output
+      format.page.count
+    }
+  if$
+}
+
+FUNCTION {calc.format.page.count}
+{
+  numpages empty.or.unknown
+   {
+     pages empty.or.unknown
+        {
+        "" 'page.count :=
+        }
+        { reduce.pages.to.page.count }
+     if$
+   }
+   { numpages 'page.count := }
+  if$
+  format.page.count
+}
+
+
+FUNCTION { journal.canon.abbrev }
+{
+  % Returns a canonical abbreviation for 'journal', or else 'journal'
+  % unchanged.
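+  % Note: several titles are listed in both braced ("{ACM} ...") and
+  % unbraced forms, so either spelling in the .bib file is matched.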
+  journal "ACM Computing Surveys"                                                                       = { "Comput. Surveys"                                 } {
+  journal "{ACM} Computing Surveys"                                                                     = { "Comput. Surveys"                                 } {
+  journal "ACM Transactions on Mathematical Software"                                                   = { "ACM Trans. Math. Software"                       } {
+  journal "{ACM} Transactions on Mathematical Software"                                                 = { "ACM Trans. Math. Software"                       } {
+  journal "ACM SIGNUM Newsletter"                                                                       = { "ACM SIGNUM Newslett."                            } {
+  journal "ACM {SIGNUM} Newsletter"                                                                     = { "ACM SIGNUM Newslett."                            } {
+  journal "{ACM} SIGNUM Newsletter"                                                                     = { "ACM SIGNUM Newslett."                            } {
+  journal "{ACM} {SIGNUM} Newsletter"                                                                   = { "ACM SIGNUM Newslett."                            } {
+  journal "American Journal of Sociology"                                                               = { "Amer. J. Sociology"                              } {
+  journal "American Mathematical Monthly"                                                               = { "Amer. Math. Monthly"                             } {
+  journal "American Mathematical Society Translations"                                                  = { "Amer. Math. Soc. Transl."                        } {
+  journal "Applied Mathematics and Computation"                                                         = { "Appl. Math. Comput."                             } {
+  journal "British Journal of Mathematical and Statistical Psychology"                                  = { "Brit. J. Math. Statist. Psych."                  } {
+  journal "Bulletin of the American Mathematical Society"                                               = { "Bull. Amer. Math. Soc."                          } {
+  journal "Canadian Mathematical Bulletin"                                                              = { "Canad. Math. Bull."                              } {
+  journal "Communications of the ACM"                                                                   = { "Commun. ACM"                                     } {
+  journal "Communications of the {ACM}"                                                                 = { "Commun. ACM"                                     } {
+  journal "Computers and Structures"                                                                    = { "Comput. \& Structures"                           } {
+  journal "Contemporary Mathematics"                                                                    = { "Contemp. Math."                                  } {
+  journal "Crelle's Journal"                                                                            = { "Crelle's J."                                     } {
+  journal "Giornale di Mathematiche"                                                                    = { "Giorn. Mat."                                     } {
+  journal "IEEE Transactions on Aerospace and Electronic Systems"                                       = { "IEEE Trans. Aerospace Electron. Systems"         } {
+  journal "{IEEE} Transactions on Aerospace and Electronic Systems"                                     = { "IEEE Trans. Aerospace Electron. Systems"         } {
+  journal "IEEE Transactions on Automatic Control"                                                      = { "IEEE Trans. Automat. Control"                    } {
+  journal "{IEEE} Transactions on Automatic Control"                                                    = { "IEEE Trans. Automat. Control"                    } {
+  journal "IEEE Transactions on Computers"                                                              = { "IEEE Trans. Comput."                             } {
+  journal "{IEEE} Transactions on Computers"                                                            = { "IEEE Trans. Comput."                             } {
+  journal "IMA Journal of Numerical Analysis"                                                           = { "IMA J. Numer. Anal."                             } {
+  journal "{IMA} Journal of Numerical Analysis"                                                         = { "IMA J. Numer. Anal."                             } {
+  journal "Information Processing Letters"                                                              = { "Inform. Process. Lett."                          } {
+  journal "International Journal for Numerical Methods in Engineering"                                  = { "Internat. J. Numer. Methods Engrg."              } {
+  journal "International Journal of Control"                                                            = { "Internat. J. Control"                            } {
+  journal "International Journal of Supercomputing Applications"                                        = { "Internat. J. Supercomputing Applic."             } {
+  journal "Journal of Computational Physics"                                                            = { "J. Comput. Phys."                                } {
+  journal "Journal of Computational and Applied Mathematics"                                            = { "J. Comput. Appl. Math."                          } {
+  journal "Journal of Computer and System Sciences"                                                     = { "J. Comput. System Sci."                          } {
+  journal "Journal of Mathematical Analysis and Applications"                                           = { "J. Math. Anal. Appl."                            } {
+  journal "Journal of Mathematical Physics"                                                             = { "J. Math. Phys."                                  } {
+  journal "Journal of Parallel and Distributed Computing"                                               = { "J. Parallel and Distrib. Comput."                } {
+  journal "Journal of Research of the National Bureau of Standards"                                     = { "J. Res. Nat. Bur. Standards"                     } {
+  journal "Journal of VLSI and Computer Systems"                                                        = { "J. VLSI Comput. Syst."                           } {
+  journal "Journal of {VLSI} and Computer Systems"                                                      = { "J. VLSI Comput. Syst."                           } {
+  journal "Journal of the ACM"                                                                          = { "J. ACM"                                          } {
+  journal "Journal of the American Statistical Association"                                             = { "J. Amer. Statist. Assoc."                        } {
+  journal "Journal of the Institute of Mathematics and its Applications"                                = { "J. Inst. Math. Appl."                            } {
+  journal "Journal of the Society for Industrial and Applied Mathematics"                               = { "J. Soc. Indust. Appl. Math."                     } {
+  journal "Journal of the Society for Industrial and Applied Mathematics, Series B, Numerical Analysis" = { "J. Soc. Indust. Appl. Math. Ser. B Numer. Anal." } {
+  journal "Linear Algebra and its Applications"                                                         = { "Linear Algebra Appl."                            } {
+  journal "Mathematica Scandinavica"                                                                    = { "Math. Scand."                                    } {
+  journal "Mathematical Tables and Other Aids to Computation"                                           = { "Math. Tables Aids Comput."                       } {
+  journal "Mathematics of Computation"                                                                  = { "Math. Comp."                                     } {
+  journal "Mathematische Annalen"                                                                       = { "Math. Ann."                                      } {
+  journal "Numerische Mathematik"                                                                       = { "Numer. Math."                                    } {
+  journal "Pacific Journal of Mathematics"                                                              = { "Pacific J. Math."                                } {
+  journal "Parallel Computing"                                                                          = { "Parallel Comput."                                } {
+  journal "Philosophical Magazine"                                                                      = { "Philos. Mag."                                    } {
+  journal "Proceedings of the American Mathematical Society"                                            = { "Proc. Amer. Math. Soc."                          } {
+  journal "Proceedings of the IEEE"                                                                     = { "Proc. IEEE"                                      } {
+  journal "Proceedings of the {IEEE}"                                                                   = { "Proc. IEEE"                                      } {
+  journal "Proceedings of the National Academy of Sciences of the USA"                                  = { "Proc. Nat. Acad. Sci. U. S. A."                  } {
+  journal "Quarterly Journal of Mathematics, Oxford, Series (2)"                                        = { "Quart. J. Math. Oxford Ser. (2)"                 } {
+  journal "Quarterly of Applied Mathematics"                                                            = { "Quart. Appl. Math."                              } {
+  journal "Review of the International Statisical Institute"                                            = { "Rev. Inst. Internat. Statist."                   } {
+  journal "SIAM Journal on Algebraic and Discrete Methods"                                              = { "SIAM J. Algebraic Discrete Methods"              } {
+  journal "{SIAM} Journal on Algebraic and Discrete Methods"                                            = { "SIAM J. Algebraic Discrete Methods"              } {
+  journal "SIAM Journal on Applied Mathematics"                                                         = { "SIAM J. Appl. Math."                             } {
+  journal "{SIAM} Journal on Applied Mathematics"                                                       = { "SIAM J. Appl. Math."                             } {
+  journal "SIAM Journal on Computing"                                                                   = { "SIAM J. Comput."                                 } {
+  journal "{SIAM} Journal on Computing"                                                                 = { "SIAM J. Comput."                                 } {
+  journal "SIAM Journal on Matrix Analysis and Applications"                                            = { "SIAM J. Matrix Anal. Appl."                      } {
+  journal "{SIAM} Journal on Matrix Analysis and Applications"                                          = { "SIAM J. Matrix Anal. Appl."                      } {
+  journal "SIAM Journal on Numerical Analysis"                                                          = { "SIAM J. Numer. Anal."                            } {
+  journal "{SIAM} Journal on Numerical Analysis"                                                        = { "SIAM J. Numer. Anal."                            } {
+  journal "SIAM Journal on Scientific and Statistical Computing"                                        = { "SIAM J. Sci. Statist. Comput."                   } {
+  journal "{SIAM} Journal on Scientific and Statistical Computing"                                      = { "SIAM J. Sci. Statist. Comput."                   } {
+  journal "SIAM Review"                                                                                 = { "SIAM Rev."                                       } {
+  journal "{SIAM} Review"                                                                               = { "SIAM Rev."                                       } {
+  journal "Software Practice and Experience"                                                            = { "Software Prac. Experience"                       } {
+  journal "Statistical Science"                                                                         = { "Statist. Sci."                                   } {
+  journal "The Computer Journal"                                                                        = { "Comput. J."                                      } {
+  journal "Transactions of the American Mathematical Society"                                           = { "Trans. Amer. Math. Soc."                         } {
+  journal "USSR Computational Mathematics and Mathematical Physics"                                     = { "U. S. S. R. Comput. Math. and Math. Phys."       } {
+  journal "{USSR} Computational Mathematics and Mathematical Physics"                                   = { "U. S. S. R. Comput. Math. and Math. Phys."       } {
+  journal "Zeitschrift fur Angewandte Mathematik und Mechanik"                                          = { "Z. Angew. Math. Mech."                           } {
+  journal "Zeitschrift fur Angewandte Mathematik und Physik"                                            = { "Z. Angew. Math. Phys."                           } {
+  journal
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+  } if$ } if$ } if$ } if$ } if$ } if$ } if$ } if$
+}
+
+FUNCTION { format.journal.volume.number.day.month.year }
+{
+  % By Young (and Spencer)
+  % GNP - fixed bugs with missing volume, number, and/or pages
+  %
+  % Format journal, volume, number, and date for article types.
+  %
+  journal empty.or.unknown
+    { "no journal in " cite$ * warning$ "" }
+    { "\bibinfo{journal}{"
+      journal.canon.abbrev emphasize *
+      "}" * }
+  if$
+
+  number empty.or.unknown
+    {
+      volume empty.or.unknown
+        { "no number and no volume in " cite$ * warning$ "" * }
+        { " " * " \bibinfo{volume}{" * volume * "}" * }
+      if$
+    }
+    {
+      volume empty.or.unknown
+        {
+          "unusual to have number, but no volume, for " cite$ * warning$
+          " \bibinfo{number}{" * number * "}" *
+        }
+        { " \bibinfo{volume}{" * volume  * "}, \bibinfo{number}{" *
+          number * "}" *}
+      if$
+    }
+  if$
+  after.block 'output.state :=
+
+  % Sometimes proceedings are published in journals
+  % In this case we do not want to put year, day and month here
+
+  type$ "inproceedings" =
+    { }
+    {format.day.month.year * }
+  if$
+}
+
+FUNCTION { format.chapter.pages }
+{
+  chapter empty.or.unknown
+    'format.pages
+    { type empty.or.unknown
+        { "Chapter" } % gnp - changed to mixed case
+        { type "t" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty.or.unknown
+        {"page numbers missing in " cite$ * warning$} % gnp - added check
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.in.emphasize.booktitle }
+{ % jtb: format for collections or proceedings not appearing in a journal
+  booktitle empty.or.unknown
+  { "" }
+  { "In " format.emphasize.booktitle * }
+  if$
+}
+
+FUNCTION { format.in.booktitle }
+{ % jtb: format for proceedings appearing in a journal
+  booktitle empty.or.unknown
+  { "" }
+  { "In \bibinfo{booktitle}{" booktitle * "}" * }
+  if$
+}
+
+FUNCTION { format.in.ed.booktitle }
+{
+  booktitle empty.or.unknown
+  { "" }
+  { editor empty.or.unknown
+    { "In " format.emphasize.booktitle * }
+                % jtb: swapped editor location
+    { "In " format.emphasize.booktitle * ", " * format.editors.fml * }
+    if$
+  }
+  if$
+}
+
+FUNCTION { format.thesis.type }
+{ % call with default type on stack top
+  type empty.or.unknown
+    'skip$    % use default type
+    {
+      pop$    % discard default type
+      % NO: it is silly to have to brace protect every degree type!:  type "t" change.case$
+      type
+    }
+  if$
+}
+
+FUNCTION { format.tr.number }
+{
+  "\bibinfo{type}{"
+  type empty.or.unknown
+    { "{T}echnical {R}eport" }
+    'type
+  if$
+  "}" * *
+  number empty.or.unknown
+    { "t" change.case$ }
+    %% LOOKS BAD: { "." * number tie.or.space.connect }
+    %% Prefer "Research report RJ687." to "Research report. RJ687."
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION { format.advisor }
+{
+  advisor empty.or.unknown
+    { "" }
+    { "Advisor(s) " advisor * }
+  if$
+}
+
+FUNCTION { format.article.crossref }
+{ "See"
+  "\citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.crossref.editor }
+{
+  editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al\mbox{.}" * }         % jrh: avoid spacing problems
+    { #2 <
+    'skip$
+    { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+        { " et~al\mbox{.}" * }          % jrh: avoid spacing problems
+        { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+      if$
+    }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.book.crossref }
+{
+  volume empty.or.unknown
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect % gnp - changed to mixed case
+      " of " *
+    }
+  if$
+  editor empty.or.unknown
+  editor field.or.null author field.or.null =
+  or
+    { key empty.or.unknown
+    { series empty.or.unknown
+        { "need editor, key, or series for " cite$ * " to crossref " *
+          crossref * warning$
+          "" *
+        }
+        { series emphasize * }
+      if$
+    }
+    { key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.incoll.inproc.crossref }
+{ "See"
+  " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.lab.names }
+{
+  % format.lab.names:
+  %
+  % determines "short" names for the abbreviated author information.
+  % "Long" labels are created in calc.label, using the routine my.full.label
+  % to format author and editor fields.
+  %
+  % There are 4 cases for labels.   (n=3 in the examples; the code
+  % below currently forces "et al." for more than 2 names, i.e. n=2)
+  % a) one author             Foo
+  % b) one to n               Foo, Bar and Baz
+  % c) use of "and others"    Foo, Bar et al.
+  % d) more than n            Foo et al.
+
+  's :=
+  s num.names$ 'numnames :=
+  numnames #2 >    % change number to number of others allowed before
+                   % forcing "et al".
+    { s #1 "{vv~}{ll}" format.name$ " et~al\mbox{.}" * } % jrh: \mbox{} added
+    {
+      numnames #1 - 'namesleft :=
+      #2 'nameptr :=
+      s #1 "{vv~}{ll}" format.name$
+        { namesleft #0 > }
+        { nameptr numnames =
+            { s nameptr "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+                { " et~al\mbox{.}" * }          % jrh: avoid spacing problems
+                { " and " * s nameptr "{vv~}{ll}" format.name$ * }
+              if$
+            }
+            { ", " * s nameptr "{vv~}{ll}" format.name$ * }
+          if$
+          nameptr #1 + 'nameptr :=
+          namesleft #1 - 'namesleft :=
+        }
+      while$
+    }
+  if$
+}
+
+FUNCTION { author.key.label }
+{
+  author empty.or.unknown
+    { key empty.or.unknown
+          { "no key, author in " cite$ * warning$
+            cite$ #1 #3 substring$ }
+         'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION { editor.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if editor is null.
+  editor empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "no key, editor or organization in " cite$ * warning$
+              cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { organization }
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION { author.editor.key.label }
+{
+  author empty.or.unknown
+    { editor empty.or.unknown
+          { key empty.or.unknown
+               { "no key, author, or editor in " cite$ * warning$
+                 cite$ #1 #3 substring$ }
+             'key
+           if$
+         }
+          { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION { author.editor.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if author is null.
+  author empty.or.unknown
+    { editor empty.or.unknown
+        { organization empty.or.unknown
+            { key empty.or.unknown
+               { "no key, author, editor or organization in " cite$ * warning$
+                 cite$ #1 #3 substring$ }
+               'key
+              if$
+            }
+            { organization }
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+% Calculate label and leave it on stack
+FUNCTION { calc.basic.label }
+{
+  type$ "book" =
+  type$ "inbook" =
+  or
+  type$ "article" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+      type$ "periodical" =
+      or
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.editor.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+  duplicate$
+  year empty.or.unknown
+    { "[n. d.]" }
+    { year field.or.null purify$ #-1 #4 substring$}
+  if$
+  *
+  'basic.label.year :=
+}
+
+FUNCTION { calc.label }
+{
+  % Changed - GNP. See also author.editor.organization.sort, editor.organization.sort
+  % Form label for BibTeX entry. The classification of which fields are used
+  % for which type of entry (book, inbook, etc.) is taken from alpha.bst.
+  % The change here from newapa is to also include organization as a
+  % citation label if author or editor is missing.
+
+  calc.basic.label
+
+  author empty.or.unknown  % generate the full label citation information.
+    {
+      editor empty.or.unknown
+        {
+          organization empty.or.unknown
+            {
+              key empty.or.unknown
+                {
+                  "no author, editor, organization, or key in " cite$ * warning$
+                  "??"
+                }
+                { key }
+              if$
+            }
+            { organization }
+          if$
+        }
+        { editor my.full.label }
+      if$
+    }
+    { author my.full.label }
+  if$
+
+  % leave label on the stack, to be popped when required.
+
+  "}{" * swap$ * "}{" *
+  %  year field.or.null purify$ #-1 #4 substring$ *
+  %
+  % save the year for sort processing afterwards (adding a, b, c, etc.)
+  %
+  year empty.or.unknown
+    { "[n. d.]" }
+    { year field.or.null purify$ #-1 #4 substring$}
+  if$
+  'label.year :=
+}
+
+
+FUNCTION { output.bibitem }
+{
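+  % Emits a heading roughly of the form (illustrative):
+  %   \bibitem[\protect\citeauthoryear{Full Names}{Short Label}{Year}]%
+  %           {citekey}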
+  newline$
+  "\bibitem[\protect\citeauthoryear{" write$
+  calc.label write$
+  sort.year write$
+  "}]%" writeln
+  "        {" write$
+  cite$ write$
+  "}" writeln
+  ""
+  before.all 'output.state :=
+}
+
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint }
+{ % enter and return with stack empty
+  %% We switch now from buffered output to output of complete lines, so
+  %% that the Issue .. URL data have their own lines, and are less likely
+  %% to be line-wrapped by BibTeX's short-sighted algorithm, which wraps
+  %% lines longer than 79 characters, backtracking to what it thinks is
+  %% a break point in the string.  Any such wrapping MUST be undone to
+  %% prevent percent-newline from appearing in DOIs and URLs.  The
+  %% output data are intentionally wrapped in \showxxx{} macros at
+  %% beginning of line, and that supply their own punctuation (if they
+  %% are not defined to suppress output entirely), to make it easier for
+  %% other software to recover them from .bbl files.
+  %%
+  %% It also makes it possible to later change the macro definitions
+  %% to suppress particular output values, or alter their appearance.
+  %%
+  %% Note that it is possible for theses, technical reports, and
+  %% manuals to have ISBNs, and anything that has an ISBN may also
+  %% have an ISSN.  When there are no values for these keys, there
+  %% is no output generated for them here.
+
+  "\newblock" writeln
+  after.block 'output.state :=
+
+  output.issue
+  output.isbn
+  output.coden  % CODEN is functionally like ISSN, so output them sequentially
+  output.issn
+  output.lccn
+  output.doi    % DOI is ALWAYS last according to CrossRef DOI documentation
+  output.eprint
+  output.url    % but ACM wants URL last
+}
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint.note }
+{ % enter with stack empty, return with empty string on stack
+  output.issue.doi.coden.isxn.lccn.url.eprint
+  note empty.or.unknown
+    { }
+    {
+      "\newblock" writeln
+      output.note
+    }
+  if$
+  ""
+}
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.eprint.note.check }
+{ % enter with stack empty, return with empty string on stack
+  output.issue.doi.coden.isxn.lccn.url.eprint
+  note empty.or.unknown
+    { }
+    {
+      "\newblock" writeln
+      output.note.check
+    }
+  if$
+  ""
+}
+
+FUNCTION { article }
+{
+  output.bibitem
+
+  author empty.or.unknown
+    {
+      editor empty.or.unknown
+        { "neither author and editor supplied for " cite$ * warning$ }
+        { format.editors "editor" output.check }
+      if$
+    }
+    { format.authors "author" output.check }
+  if$
+
+  author format.no.key output       % added
+  output.year.check                 % added
+  new.block
+  format.articletitle "title" output.check
+  new.block
+  howpublished empty.or.unknown
+    { }
+    { "\bibinfo{howpublished}{" howpublished "}" * * output }
+  if$
+
+  crossref missing$
+    { format.journal.volume.number.day.month.year output}
+    {
+      "cross reference in @Article{...} is unusual" warning$
+      format.article.crossref output.nonnull
+    }
+  if$
+
+  format.pages.check.without.articleno output
+  format.articleno.numpages output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { book }
+{
+  output.bibitem
+  author empty.or.unknown
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  output.year.check       % added
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { new.sentence              % jtb: start a new sentence for series/volume
+      format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" bibinfo.output.check
+      address "address" bibinfo.output.check    % jtb: require address
+      fin.sentence
+      pages empty.or.unknown
+        { format.bookpages }    % use bookpages when pages empty
+        { format.pages.check "pages" tie.or.space.connect }
+      if$
+      output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { booklet }
+{
+  output.bibitem
+  format.authors output
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title "title" output.check
+  new.block
+    howpublished empty.or.unknown
+    { }
+    { "\bibinfo{howpublished}{" howpublished "}" * * output }
+  if$
+  address output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { inbook }
+{
+  output.bibitem
+  author empty.or.unknown
+    { format.editors
+      "author and editor" output.check
+    }
+    { format.authors output.nonnull
+      crossref missing$
+    { "author and editor" editor either.or.check }
+    'skip$
+      if$
+    }
+  if$
+  output.year.check                 % added
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { new.sentence              % jtb: start a new sentence for series/volume
+      format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" bibinfo.output.check
+      address "address" bibinfo.output.check    % jtb: require address
+      format.bookpages output
+      format.chapter.pages
+      "chapter and pages" output.check  % jtb: moved from before publisher
+    }
+    {
+      format.bookpages output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { incollection }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output       % added
+  output.year.check              % added
+  new.block
+  format.articletitle "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      new.sentence                % jtb: start a new sentence for series/volume
+      format.bvolume output
+      format.number.series output
+      new.sentence
+      publisher "publisher" bibinfo.output.check
+      address "address" bibinfo.output.check      % jtb: require address
+      format.bookpages output
+      format.chapter.pages output % gnp - was special.output.nonnull
+                                  % left out comma before page numbers
+                                  % jtb: moved from before publisher
+    }
+    {
+      format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { inproceedings }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output            % added
+  output.year.check                   % added
+  new.block
+  format.articletitle "title" output.check
+  howpublished empty.or.unknown
+    { }
+    { "\bibinfo{howpublished}{" howpublished "}" * * output.dot.space }
+  if$
+  crossref missing$
+    {
+      journal missing$          % jtb: proceedings appearing in journals
+        { format.in.emphasize.booktitle format.city "booktitle"  output.check.dot.space
+          format.series output.removenospace
+          format.editors.fml output % BV 2011/09/27 Moved dot to comma
+          format.bvolume.noseries output
+          new.sentence
+          organization output
+          publisher "publisher" bibinfo.output.check % jtb: require publisher (?)
+          address "address" bibinfo.output.check  % jtb: require address
+          format.bookpages output
+        }
+        {
+           format.in.booktitle format.city "booktitle" output.check
+           format.editors.fml output
+           new.sentence
+           format.journal.volume.number.day.month.year output
+        }
+      if$
+      format.articleno output
+      format.pages.check.without.articleno output
+    }
+    {
+      format.incoll.inproc.crossref output.nonnull
+      format.articleno output
+      format.pages.check.without.articleno output
+    }
+  if$
+  format.articleno.numpages output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { conference } { inproceedings }
+
+FUNCTION { manual }
+{
+  output.bibitem
+  author empty.or.unknown
+    { editor empty.or.unknown
+      { organization "organization" output.check
+        organization format.key output }  % if all else fails, use key
+      { format.editors "author and editor" output.check }
+      if$
+    }
+    { format.authors output.nonnull }
+    if$
+  output.year.check                 % added
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  % jtb: back to normal style: organization, address
+  organization "organization" output.check
+  address output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { mastersthesis }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title emphasize "title" output.check  % NB: ACM style requires emphasized thesis title
+  new.block
+  "\bibinfo{thesistype}{Master's\ thesis}" format.thesis.type output
+  new.sentence
+  school "school" bibinfo.output.check
+  address empty.or.unknown
+     { }
+     { "\bibinfo{address}{" address * "}" * output }
+  if$
+  new.block
+  format.advisor output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { misc }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output            % added
+  output.year.check                   % added
+  title howpublished new.block.checkb
+  format.title output
+  new.block
+  howpublished empty.or.unknown
+    { }
+    { "\bibinfo{howpublished}{" howpublished "}" * * output }
+  if$
+  "" output.nonnull.dot.space
+  output.day.month.year
+  calc.format.page.count output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { online } { misc }
+
+FUNCTION { game } { misc }
+
+
+FUNCTION { phdthesis }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title emphasize "title" output.check  % NB: ACM style requires emphasized thesis title
+  new.block
+  "\bibinfo{thesistype}{Ph.D. Dissertation}" format.thesis.type output
+  new.sentence
+  school "school" bibinfo.output.check
+  address empty.or.unknown
+     { }
+     { "\bibinfo{address}{" address * "}" * output }
+  if$
+  new.block
+  format.advisor output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION {format.date}
+{ year empty.or.unknown
+    { month empty.or.unknown
+        {
+          ""                    % output empty date if year/month both empty
+          day empty.or.unknown
+            {  }
+            { "there's a day but no month or year in " cite$ * warning$ }
+          if$
+        }
+        { "there's a month but no year in " cite$ * warning$
+          month
+          day empty.or.unknown
+            { }
+            { " " * day * }
+          if$
+        }
+      if$
+    }
+    { month empty.or.unknown
+        {
+          year                  % output only year if month empty
+          day empty.or.unknown
+            {  }
+            { "there's a day and year but no month in " cite$ * warning$ }
+          if$
+        }
+        {
+          month " " *
+          day empty.or.unknown
+            { }
+            { day * ", " * }
+          if$
+          year *
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {new.block.checka}
+{
+  empty.or.unknown
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION { periodical }
+{
+  output.bibitem
+  editor empty.or.unknown
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  output.year.check
+  new.sentence
+  format.articletitle "title" output.check
+  format.journal.volume.number.day.month.year output
+  calc.format.page.count output
+  fin.entry
+}
+
+FUNCTION { proceedings }
+{
+  output.bibitem
+  editor empty.or.unknown
+    { organization output
+      organization format.key output }  % gnp - changed from author format.key
+    { format.editors output.nonnull }
+  if$
+  % author format.key output             % gnp - removed (should be either
+  %                                        editor or organization)
+  output.year.check                    % added (newapa)
+  new.block
+  format.btitle format.city "title" output.check        % jtb: added city
+  new.sentence
+  format.bvolume output
+  format.number.series output
+  new.sentence
+  organization output
+  % jtb: normal order: publisher, address
+  publisher empty.or.unknown
+     { }
+     { "\bibinfo{publisher}{" publisher * "}" * output }
+  if$
+  address empty.or.unknown
+     { }
+     { "\bibinfo{address}{" address * "}" * output }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { collection } { proceedings }
+
+FUNCTION { techreport }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output             % added
+  output.year.check                    % added
+  new.block
+  format.btitle "title" output.check
+  new.block
+%   format.tr.number output               % jtb: moved month ...
+  format.tr.number output new.sentence    % Gerry  - need dot 2011/09/28
+  institution "institution" bibinfo.output.check
+  address empty.or.unknown
+    { }
+    { "\bibinfo{address}{" address "}" * * output }
+  if$
+  new.sentence
+  format.named.pages output
+  % ACM omits year at end in transactions style
+  % format.day.month.year output.nonnull.dot.space  % jtb: ... to here (no parens)
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note
+  fin.entry
+}
+
+FUNCTION { unpublished }
+{
+  output.bibitem
+  format.authors
+  "author" output.check
+  author format.key output              % added
+  output.year.check                     % added
+  new.block
+  format.title "title" output.check
+  fin.sentence
+  output.day.month.year                 % UTAH
+  calc.format.page.count output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.eprint.note.check
+  fin.entry
+}
+
+FUNCTION { default.type } { misc }
+
+%%% ACM journal-style month definitions: full name if 1--5 letters, else
+%%% abbreviation of 3 or 4 characters and a dot
+
+MACRO {jan}             {"Jan."}
+
+MACRO {feb}             {"Feb."}
+
+MACRO {mar}             {"March"}
+
+MACRO {apr}             {"April"}
+
+MACRO {may}             {"May"}
+
+MACRO {jun}             {"June"}
+
+MACRO {jul}             {"July"}
+
+MACRO {aug}             {"Aug."}
+
+MACRO {sep}             {"Sept."}
+
+MACRO {oct}             {"Oct."}
+
+MACRO {nov}             {"Nov."}
+
+MACRO {dec}             {"Dec."}
+
+
+
+READ
+
+FUNCTION { sortify }
+{
+  purify$
+  "l" change.case$
+}
+
+FUNCTION { chop.word }
+{
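+  % stack in:  word len string
+  % stack out: string with the leading word (of length len) removed,
+  %            if string starts with word; otherwise string unchanged.
+  % Example: "The " #4 "The Title" chop.word --> "Title"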
+  's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION { sort.format.names }
+{
+  's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+          { "   " * }
+         'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+          { " et~al" * }
+          { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION { sort.format.title }
+{
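+  % Strip a leading "A ", "An ", or "The " from the title, then sortify
+  % it for use as a sort key.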
+  't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION { author.sort }
+{
+  author empty.or.unknown
+    { key empty.or.unknown
+         { "to sort, need author or key in " cite$ * warning$
+           "" }
+         { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { author.editor.sort }
+{
+  author empty.or.unknown
+    {
+      editor empty.or.unknown
+         {
+           key empty.or.unknown
+             { "to sort, need author, editor, or key in " cite$ * warning$
+               ""
+             }
+             { key sortify }
+           if$
+         }
+         { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { editor.organization.sort }
+{
+  % added - GNP. Stack editor or organization for sorting (from alpha.bst).
+  % Unlike alpha.bst, we need entire names, not abbreviations
+
+  editor empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { organization sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION { author.editor.organization.sort }
+{
+  % added - GNP. Stack author or organization for sorting (from alpha.bst).
+  % Unlike alpha.bst, we need entire names, not abbreviations
+
+  author empty.or.unknown
+    {
+      editor empty.or.unknown
+        { organization empty.or.unknown
+            { key empty.or.unknown
+                { "to sort, need author, editor, or key in " cite$ * warning$
+                ""
+                }
+                { key sortify }
+              if$
+            }
+            { organization sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { presort }
+{
+  % Presort creates the bibentry's label via a call to calc.label, and then
+  % builds the sort key according to entry type. Chicago.bst adds support for
+  % including organizations as the sort key; the following is stolen from
+  % alpha.bst.
+
+  calc.label
+  basic.label.year
+  swap$
+  "    "
+  swap$
+  * *
+  "    "
+  *
+  sortify
+  year field.or.null purify$ #-1 #4 substring$ * % add year
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+  type$ "article" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+      type$ "periodical" =
+      or
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.editor.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  #1 entry.max$ substring$        % added for newapa
+  'sort.label :=                  % added for newapa
+  sort.label                      % added for newapa
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+
+
+ITERATE { presort }
+
+SORT             % by label, year, author/editor, title
+
+% From plainnat.bst
+STRINGS { longest.label }
+
+INTEGERS { longest.label.width number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+
+
+
+FUNCTION { initialize.extra.label.stuff }
+{ #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'last.extra.num :=
+}
+
+FUNCTION { forward.pass }
+{
+  % Pass through all entries, comparing current entry to last one.
+  % Need to concatenate year to the stack (done by calc.label) to determine
+  % if two entries are the same (see presort)
+
+  last.label
+  calc.basic.label year field.or.null purify$ #-1 #4 substring$ * % add year
+  #1 entry.max$ substring$ =     % are they equal?
+     { last.extra.num #1 + 'last.extra.num :=
+       last.extra.num int.to.chr$ 'extra.label :=
+     }
+     { "a" chr.to.int$ 'last.extra.num :=
+       "" 'extra.label :=
+       calc.basic.label year field.or.null purify$ #-1 #4 substring$ * % add year
+       #1 entry.max$ substring$ 'last.label := % assign to last.label
+     }
+  if$
+  number.label #1 + 'number.label :=
+}
+
+FUNCTION { reverse.pass }
+{
+  next.extra "b" =
+    { "a" 'extra.label := }
+     'skip$
+  if$
+  label.year extra.label * 'sort.year :=
+  extra.label 'next.extra :=
+}
+
+EXECUTE {initialize.extra.label.stuff}
+EXECUTE {initialize.longest.label}
+
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
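+
+% Net effect of the two passes (an illustrative trace, not from the source):
+% if two entries both sort as "smith 1994", forward.pass counts the
+% collision and reverse.pass assigns suffixes so they cite as Smith [1994a]
+% and Smith [1994b]; an uncontested entry keeps a bare year with no suffix.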
+
+FUNCTION { bib.sort.order }
+{
+  sort.label
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE { bib.sort.order }
+
+SORT             % by sort.label, year, title --- giving final bib. order.
+
+FUNCTION { begin.bib }
+{
+  %% Set to #0 to show the 13-digit ISBN in preference to the 10-digit ISBN.
+  %% Set to #1 to show both 10-digit and 13-digit ISBNs.
+  #1 'show-isbn-10-and-13 :=
+
+  "%%% -*-BibTeX-*-" writeln
+  "%%% Do NOT edit. File created by BibTeX with style" writeln
+  "%%% ACM-Reference-Format-Journals [18-Jan-2012]." writeln
+  "" writeln
+
+  preamble$ empty.or.unknown
+    'skip$
+    { preamble$ writeln }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" * writeln
+  ""                                                                         writeln
+  "%%% ====================================================================" writeln
+  "%%% NOTE TO THE USER: you can override these defaults by providing"       writeln
+  "%%% customized versions of any of these macros before the \bibliography"  writeln
+  "%%% command.  Each of them MUST provide its own final punctuation,"       writeln
+  "%%% except for \shownote{}, \showDOI{}, and \showURL{}.  The latter two"  writeln
+  "%%% do not use final punctuation, in order to avoid confusing it with"    writeln
+  "%%% the Web address."                                                     writeln
+  "%%%"                                                                      writeln
+  "%%% To suppress output of a particular field, define its macro to expand" writeln
+  "%%% to an empty string, or better, \unskip, like this:"                   writeln
+  "%%%"                                                                      writeln
+  "%%% \newcommand{\showDOI}[1]{\unskip}   % LaTeX syntax"                   writeln
+  "%%%"                                                                      writeln
+  "%%% \def \showDOI #1{\unskip}           % plain TeX syntax"               writeln
+  "%%%"                                                                      writeln
+  "%%% ====================================================================" writeln
+  ""                                                                         writeln
+
+  %% ACM publications do not use CODEN, ISSN, and LCCN data, so their default
+  %% macro wrappers expand to \unskip, discarding their values and unwanted
+  %% space.
+  %%
+  %% For other publications, prior definitions like these may be useful:
+  %%
+  %%     Plain TeX:
+  %%         \def \showCODEN     #1{CODEN #1.}
+  %%         \def \showISSN      #1{ISSN #1.}
+  %%         \def \showLCCN      #1{LCCN #1.}
+  %%
+  %%     LaTeX:
+  %%         \newcommand{\showCODEN}[1]{CODEN #1.}
+  %%         \newcommand{\showISSN}[1]{ISSN #1.}
+  %%         \newcommand{\showLCCN}[1]{LCCN #1.}
+
+  "\ifx \showCODEN    \undefined \def \showCODEN     #1{\unskip}     \fi" writeln
+  "\ifx \showDOI      \undefined \def \showDOI       #1{#1}\fi" writeln
+  % ACM styles omit ISBNs, but they can be included by suitable definitions of
+  % \showISBNx and \showISBNxiii before the .bbl file is read
+  "\ifx \showISBNx    \undefined \def \showISBNx     #1{\unskip}     \fi" writeln
+  "\ifx \showISBNxiii \undefined \def \showISBNxiii  #1{\unskip}     \fi" writeln
+  "\ifx \showISSN     \undefined \def \showISSN      #1{\unskip}     \fi" writeln
+  "\ifx \showLCCN     \undefined \def \showLCCN      #1{\unskip}     \fi" writeln
+  "\ifx \shownote     \undefined \def \shownote      #1{#1}          \fi" writeln % NB: final period supplied by add.period$ above
+  "\ifx \showarticletitle \undefined \def \showarticletitle #1{#1}   \fi" writeln
+  "\ifx \showURL      \undefined \def \showURL       {\relax}        \fi" writeln
+  "% The following commands are used for tagged output and should be " writeln
+  "% invisible to TeX" writeln
+  "\providecommand\bibfield[2]{#2}" writeln
+  "\providecommand\bibinfo[2]{#2}" writeln
+  "\providecommand\natexlab[1]{#1}" writeln
+  "\providecommand\showeprint[2][]{arXiv:#2}" writeln
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION { end.bib }
+{
+  newline$
+  "\end{thebibliography}"
+  writeln
+}
+
+EXECUTE {end.bib}
diff --git a/paper_source/Makefile b/paper_source/Makefile
new file mode 100644 (file)
index 0000000..87240b5
--- /dev/null
@@ -0,0 +1,28 @@
+#!/usr/bin/make
+all: $(patsubst %.Rtex,%.pdf,$(wildcard *.Rtex)) 
+pdf: all
+
+%.tex: %.Rtex
+       Rscript -e "library(knitr); knit('$<')"
+
+%.pdf: %.tex
+       latexmk -f -pdf -quiet $<
+
+clean: 
+       latexmk -f -pdf -quiet -c *.tex
+       rm -rf figure/
+       rm -f *.tex
+       rm -f *.tmp
+       rm -f vc
+       rm -f *.bbl
+       rm -f generalizable_wiki.pdf
+
+
+viewpdf: all
+       evince *.pdf
+
+spell:
+       aspell -c -t --tex-check-comments -b text.tex
+
+.PHONY: clean all
+.PRECIOUS: %.tex
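+
+# Typical invocations (a sketch; assumes GNU make with R/knitr, latexmk,
+# evince, and aspell available, as the rules above expect):
+#   make            # knit each *.Rtex to .tex, then build each PDF
+#   make clean      # remove knitr figures and LaTeX build products
+#   make viewpdf    # rebuild and open the resulting PDFs in evince
+#   make spell      # interactively spell-check text.tex in TeX mode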
diff --git a/paper_source/SIGCHI-Reference-Format.bst b/paper_source/SIGCHI-Reference-Format.bst
new file mode 100644 (file)
index 0000000..6fc5574
--- /dev/null
@@ -0,0 +1,3352 @@
+%%% -*-BibTeX-*-
+%%% ====================================================================
+%%%  @BibTeX-style-file{
+%%%     author          = "Nelson H. F. Beebe, Boris Veytsman and Gerald Murray",
+%%%     version         = "1.00",
+%%%     date            = "18 January 2012",
+%%%     time            = "11:48 EST",
+%%%     filename        = "ACM-Reference-Format-Journals.bst",
+%%%     address         = "University of Utah
+%%%                        Department of Mathematics, 110 LCB
+%%%                        155 S 1400 E RM 233
+%%%                        Salt Lake City, UT 84112-0090
+%%%                        USA",
+%%%     telephone       = "+1 801 581 5254",
+%%%     FAX             = "+1 801 581 4148",
+%%%     URL             = "http://www.math.utah.edu/~beebe",
+%%%     checksum        = "available here: http://www.acm.org/publications/latex_style/CRC-journals.txt"
+%%%     email           = "beebe@math.utah.edu, beebe@acm.org,
+%%%                        beebe@computer.org, borisv@lk.net, murray@hq.acm.org", 
+%%%     codetable       = "ISO/ASCII",
+%%%     keywords        = "ACM Transactions bibliography style; BibTeX",
+%%%     license         = "public domain",
+%%%     supported       = "yes",
+%%%     abstract        = "",
+%%%     docstring       = "The checksum field, above, is produced by WinMD5Free (v1.20) 
+%%%                        available from http://www.winmd5.com/?rid=winmd5,"
+%%%  }
+%%% ====================================================================
+
+% "SIGCHI Format" BibTeX style, Forked from ACM-Reference-Format-Journals.bst
+%  Modifications 13-FEBRUARY-2015 (David Ayman Shamma)
+
+% "ACM Transactions" BibTeX style, ACM-Reference-Format-Journals.bst
+%  for BibTeX version 0.99c, LaTeX version 3.141
+%  ACM extensions with code cleaned up, extended, and debugged 10--15-Nov-2008
+%  Revised 17-APRIL-2008 (Nelson)
+%  Revised 13-MARCH-2011 (Boris/Gerry)
+%  Revised 23-MARCH-2011 (Boris/Gerry)
+%  Revised 27-MARCH-2011 (Boris/Gerry)
+%  Revised 15-APRIL-2011 (Boris/Gerry)
+%  Revised 27-SEPTEMBER-2011 (Boris)
+%
+%
+% History (by Nelson)
+%
+% Based on 'acmtrans' (for ACM Journals)
+% Date: 28th April 2008
+%
+% 1. Avoid 'repeating' the 'month' values.
+% 2. Avoid incorrectly italicizing the volume number.
+% 3. Avoid failing to italicize certain titles (book, inproceedings, etc.).
+% 4. NO series if there is NO volume.
+% 5. Sort references by the first author's surname.
+% 6. Article no added for Inproceedings.
+%
+% Date: 07th May 2008
+%
+% 1. Abbreviation list added
+%
+%   Citation format: [author-last-name year]
+%             [author-last-name and author-last-name year]
+%             [author-last-name, author-last-name, and author-last-name year]
+%             [author-last-name et al. year]
+%             [author-last-name]
+%             author-last-name [year]
+%             [author-last-name and author-last-name]
+%             [author-last-name et al.]
+%             [year] or [year,year]
+%             year or year,year
+%
+%   Reference list ordering: alphabetical by author or whatever passes
+%    for author in the absence of one.
+%
+% Features of the old acmtrans.bst:
+% =================================
+%
+% - all authors appear last name first.
+% - all pages are listed xx-xx, (no pp.) and are at the end of the reference
+% - publishers are identified as publisher, address
+% - conferences papers (inproceedings) may give city of conference,
+%   date of conference, and journal that the proceedings appear in.
+% - months abbreviated to max four letters (eg. Mar.)
+% - volume of a series indicated after the title of the series
+% - editors appear after edited title and are identified by a trailing "Eds."
+%   not in parentheses.  Editor names are not given in small caps.
+%   (unless there is no author line)
+% - names terminated with a period even if there is no first name.
+% - editions are indicated trailing after the work, not in parentheses.
+% - "et al." citations have a protected period to avoid bad spacing (jrh)
+% - "address" required when publisher given
+% - series (roman) and volume are in a sentence separate from (book-)title
+%
+%
+% Features of chicago.bst:
+% =======================
+%
+% - full names used in citations, but abbreviated citations are available
+%   (see above)
+% - if an entry has a "month", then the month and year are also printed
+%   as part of that bibitem.
+% - all conjunctions use "and" instead of "\&"
+% - major modification from Chicago Manual of Style (13th ed.) is that
+%   only the first author in a reference appears last name first-
+%   additional authors appear as J. Q. Public.
+% - pages are listed as "pp. xx-xx" in all entry types except
+%   article entries.
+% - book, inbook, and manual use "location: publisher" (or organization)
+%   for address and publisher. All other types list publishers separately.
+% - "pp." are used to identify page numbers for all entry types except
+%   articles.
+% - organization is used as a citation label if neither author nor editor
+%   is present (for manuals).
+% - "et al." is used for long author and editor lists, or when "others"
+%   is used.
+%
+% Modifications and bug fixes from newapa.bst:
+% ===========================================
+%
+%   - added month, year to bib entries if month is present
+%   - fixed bug with In proceedings, added necessary comma after title
+%   - all conjunctions changed to "and" from "\&"
+%   - fixed bug with author labels in my.full.label: "et al." now is
+%        generated when "others" is an author name
+%   - major modification from Chicago Manual of Style (13th ed.) is that
+%     only the first author in a reference appears last name first-
+%     additional authors appear as J. Q. Public.
+%   - pages are listed as "pp. xx-xx" in all entry types except
+%     article entries. Unnecessary (IMHO) "()" around page numbers
+%     were removed, and page numbers now don't end with a period.
+%   - created chicago.sty for use with this bibstyle (required).
+%   - fixed bugs in FUNCTION {format.vol.num.pages} for missing volume,
+%     number, and /or pages. Renamed to format.journal.volume.number.
+%   - fixed bug in formatting booktitles: additional period an error if
+%     book has a volume.
+%   - fixed bug: editors usually given redundant period before next clause
+%     (format.editors.dot) removed.
+%   - added label support for organizations, if both author and editor
+%     are missing (from alpha.bst). If organization is too long, then
+%     the key field is used for abbreviated citations.
+%   - In proceedings or books of several volumes, no comma was written
+%     between the "Volume x" and the page numbers (this was intentional
+%     in newapa.bst). Fixed.
+%   - Some journals may not have volumes/numbers, only month/year (eg.
+%     IEEE Computer). Fixed bug in article style that assumed volume/number
+%     was always present.
+%
+% Original documentation for newapa.sty:
+% =====================================
+%
+% This version was made by modifying the master file made by
+% Oren Patashnik (PATASHNIK@SCORE.STANFORD.EDU), and the 'named' BibTeX
+% style of Peter F. Patel-Schneider.
+%
+% Copyright (C) 1985, all rights reserved.
+% Copying of this file is authorized only if either
+% (1) you make absolutely no changes to your copy, including name, or
+% (2) if you do make changes, you name it something other than 'newapa.bst'.
+% There are undoubtedly bugs in this style.  If you make bug fixes,
+% improvements, etc.  please let me know.  My e-mail address is:
+%    spencer@cgrg.ohio.state.edu or 71160.3141@compuserve.com
+%
+% This style was made from 'plain.bst', 'named.bst', and 'apalike.bst',
+% with lots of tweaking to make it look like APA style, along with tips
+% from Young Ryu and Brian Reiser's modifications of 'apalike.bst'.
+%
+%
+% Start of ACM-Reference-Format-Journals.bst
+%
+% Note: Many of the new bibentry 'fields' will only work with the 
+% 'ACM-Reference-Format-Journals.bst' file. Legacy .bib files (which will, in all probability,
+% NOT contain these new fields) will _still_ work with the ACM-Reference-Format-Journals.bst.
+% 
+%
+ENTRY
+  { address
+    advisor
+    author
+    booktitle
+    chapter
+    city        % jtb: added
+    date        % jtb: added
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+        % New keys recognized 
+        issue         % UTAH: used in, e.g., ACM SIGSAM Bulletin and ACM Communications in Computer Algebra
+        articleno
+        day           % UTAH: needed for newspapers, weeklies, bi-weeklies
+        doi           % UTAH
+        url           % UTAH
+        bookpages     % UTAH
+        numpages
+        lastaccessed  % UTAH: used only for @Misc{...}
+        coden         % UTAH
+        isbn          % UTAH
+        isbn-13       % UTAH
+        issn          % UTAH
+        lccn          % UTAH
+  }
+  {}
+  { label.year extra.label sort.year sort.label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+INTEGERS { show-isbn-10-and-13 }  % initialized below in begin.bib
+
+INTEGERS { nameptr namesleft numnames }
+
+INTEGERS { multiresult }
+
+INTEGERS { len }
+
+INTEGERS { last.extra.num }
+
+STRINGS { s t t.org u }
+
+STRINGS { last.label next.extra }
+
+STRINGS { p1 p2 p3 page.count }
+
+FUNCTION { dump.stack.1 }
+{
+    duplicate$ "STACK[top] = [" swap$ * "]" * warning$
+}
+
+FUNCTION { dump.stack.2 }
+{
+    duplicate$ "STACK[top  ] = [" swap$ * "]" * warning$
+    swap$
+    duplicate$ "STACK[top-1] = [" swap$ * "]" * warning$
+    swap$
+}
+
+FUNCTION { empty.or.unknown }
+{
+  %% Examine the top stack entry, and push 1 if it is empty, or
+  %% consists only of whitespace, or is a string beginning with two
+  %% queries (??), and otherwise, push 0.
+  %%
+  %% This function provides a replacement for empty$, with the
+  %% convenient feature that unknown values marked by two leading
+  %% queries are treated the same as missing values, and thus, do not
+  %% appear in the output .bbl file, and yet, their presence in .bib
+  %% file(s) serves to mark values which are temporarily missing, but
+  %% are expected to be filled in eventually once more data is
+  %% obtained.  The TeX User Group and BibNet bibliography archives
+  %% make extensive use of this practice.
+  %%
+  %% An empty string cannot serve the same purpose, because just as in
+  %% statistics data processing, an unknown value is not the same as an
+  %% empty value.
+  %%
+  %% At entry: stack = ... top:[string]
+  %% At exit:  stack = ... top:[0 or 1]
+
+  duplicate$ empty$
+    { pop$ #1 }
+    { #1 #2 substring$ "??" = }
+  if$
+}
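+
+%% Illustrative use (hypothetical entry): a .bib field written as
+%%     pages = "??",
+%% is treated exactly like a missing pages field (1 is pushed), while
+%% pages = "12--19" pushes 0 and is formatted normally.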
+
+FUNCTION { writeln }
+{
+  %% In BibTeX style files, the sequences
+  %%
+  %%     ... "one" "two" output
+  %%     ... "one" "two" output.xxx
+  %%
+  %% ship "one" to the output file, possibly following by punctuation,
+  %% leaving the stack with
+  %%
+  %%     ... "two"
+  %%
+  %% There is thus a one-string lag in output processing that must be
+  %% carefully handled to avoid duplicating a string in the output
+  %% file.  Unless otherwise noted, all output.xxx functions leave
+  %% just one new string on the stack, and that model should be borne
+  %% in mind when reading or writing function code.
+  %%
+  %% BibTeX's asynchronous buffering of output from strings from the
+  %% stack is confusing because newline$ bypasses the buffer.  It
+  %% would have been so much easier for newline to be a character
+  %% rather than a state of the output-in-progress.
+  %%
+  %% The documentation in btxhak.dvi is WRONG:  it says
+  %%
+  %%    newline$ Writes onto the bbl file what's accumulated in the
+  %%             output buffer. It writes a blank line if and only
+  %%             if the output buffer is empty. Since write$ does
+  %%             reasonable line breaking, you should use this
+  %%             function only when you want a blank line or an
+  %%             explicit line break.
+  %%
+  %%    write$   Pops the top (string) literal and writes it on the
+  %%             output buffer (which will result in stuff being
+  %%             written onto the bbl file when the buffer fills
+  %%             up).
+  %%
+  %% Examination of the BibTeX source code shows that write$ does
+  %% indeed behave as claimed, but newline$ sends a newline character
+  %% directly to the output file, leaving the stack unchanged.  The
+  %% first line "Writes onto ... buffer." is therefore wrong.
+  %%
+  %% The original BibTeX style files almost always use "write$ newline$"
+  %% in that order, so it makes sense to hide that pair in a private
+  %% function like this one, named after a statement in Pascal,
+  %% the programming language embedded in the BibTeX Web program.
+
+  write$                % output top-of-stack string
+  newline$              % immediate write of newline (not via stack)
+}
+
+FUNCTION { init.state.consts }
+{
+  #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+FUNCTION { output.nonnull }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<comma><space>
+  's :=
+  output.state mid.sentence =
+    {
+      ", " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln
+          "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.dot.space }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<dot><space>
+  's :=
+  output.state mid.sentence =           % { "<DEBUG output.nonnull.dot.space>. " * write$ }
+    {
+      ". " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.remove }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S<space>
+  's :=
+  output.state mid.sentence =
+    {
+      " " * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output.nonnull.removenospace }
+{ % Stack in: ... R S T  Stack out: ... R T   File out: S
+  's :=
+  output.state mid.sentence =
+    {
+      "" * write$
+    }
+    {
+      output.state after.block =
+        {
+          add.period$ writeln "\newblock " write$
+        }
+        {
+          output.state before.all =
+            {
+              write$
+            }
+            {
+              add.period$ " " * write$
+            }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION { output }
+{ % discard top token if empty, else like output.nonnull
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION { output.dot.space }
+{ % discard top token if empty, else like output.nonnull.dot.space
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull.dot.space
+  if$
+}
+
+FUNCTION { output.removenospace }
+{ % discard top token if empty, else like output.nonnull.removenospace
+  duplicate$ empty.or.unknown
+    'pop$
+    'output.nonnull.removenospace
+  if$
+}
+
+FUNCTION { output.check }
+{ % like output, but warn if key name on top-of-stack is not set
+  't :=
+  duplicate$ empty.or.unknown
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION { output.check.dot.space }
+{ % like output.dot.space, but warn if key name on top-of-stack is not set
+  't :=
+  duplicate$ empty.or.unknown
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull.dot.space
+  if$
+}
+
+FUNCTION { fin.block }
+{ % functionally, but not logically, identical to fin.entry
+   add.period$
+   writeln
+}
+
+FUNCTION { fin.entry }
+{
+   add.period$
+   writeln
+}
+
+FUNCTION { new.sentence }
+{ % update sentence state, with neither output nor stack change
+  output.state after.block =
+    'skip$
+    {
+      output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION { fin.sentence }
+{
+   add.period$
+   write$
+   new.sentence
+   ""
+}
+
+FUNCTION { new.block }
+{
+  output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION { output.coden }       % UTAH
+{ % output non-empty CODEN as one-line sentence (stack untouched)
+  coden empty.or.unknown
+    { }
+    { "\showCODEN{" coden * "}" * writeln }
+  if$
+}
+
+FUNCTION { format.articleno }
+{
+  articleno empty.or.unknown
+     { "" }
+     {
+        numpages empty.or.unknown
+          { "articleno field, but no numpages field, in " cite$ * warning$ }
+          { }
+        if$
+        "Article " articleno *
+     }
+  if$
+}
+
+FUNCTION { format.year }
+{ % push year string or "????" onto output stack
+  %% Because year is a mandatory field, we always force SOMETHING
+  %% to be output
+  year empty.or.unknown
+    { "????" }
+    { year }
+  if$
+}
+
+FUNCTION { format.day.month }
+{ % push "day month " or "month " or "" onto output stack
+  day empty.or.unknown
+    {
+      month empty.or.unknown
+        { "" }
+        { month " " *}
+      if$
+    }
+    {
+      month empty.or.unknown
+        { "" }
+        { day " " * month * " " *}
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.day.month.year }     % UTAH
+{ % if month is empty, push "" else push "(MON.)" or "(DD MON.)"
+  % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+  % acm-*.bst addition: prefix parenthesized date string with
+  % ", Article nnn "
+  articleno empty.or.unknown
+    { "" }
+    { ", " format.articleno * }
+  if$
+  " (" * format.day.month * format.year * ")" *
+}
+
+FUNCTION { output.day.month.year }     % UTAH
+{ % if month is empty value, do nothing; else output stack top and
+  % leave with new top string "(MON.)" or "(DD MON.)"
+  % Needed for frequent periodicals: 2008. ... New York Times C-1, C-2, C-17 (23 Oct.)
+  format.day.month.year
+  output.nonnull.remove
+}
+
+FUNCTION { strip.doi } % UTAH
+{ % Strip any Web address prefix to recover the bare DOI, leaving the
+  % result on the output stack, as recommended by CrossRef DOI
+  % documentation.
+  % For example, reduce "http://doi.acm.org/10.1145/1534530.1534545" to
+  % "10.1145/1534530.1534545".  That is later typeset and displayed as
+  % doi:10.1145/1534530.1534545 as the LAST item in the reference list
+  % entry.  Publisher Web sites wrap this with a suitable link to a real
+  % URL to resolve the DOI, and the master http://dx.doi.org/ address is
+  % preferred, since publisher-specific URLs can disappear in response
+  % to economic events.  All journals are encouraged by the DOI
+  % authorities to use that typeset format and link procedures for
+  % uniformity across all publications that include DOIs in reference
+  % lists.
+  % The numeric prefix is guaranteed to start with "10.", so we use
+  % that as a test.
+  doi #1 #3 substring$ "10." =
+    { doi }
+    {
+      doi #1 #7 substring$ "http://" =
+        {
+            doi #8 doi text.length$ #7 - substring$ 't := % get modifiable copy of rest of DOI
+
+            "INTERNAL STYLE-FILE ERROR" 's :=
+
+            % search for next "/" and assign its suffix to s
+
+            { t text.length$ }
+            {
+              t #1 #1 substring$ "/" =
+                {
+                  % save rest of string as true DOI (should be 10.xxxx/yyyy)
+                  t #2 t text.length$ #1 - substring$ 's :=
+                  "" 't :=    % empty string t terminates the loop
+                }
+                {
+                  % discard first character and continue loop: t <= substring(t,2,last)
+                  t #2 t text.length$ #1 - substring$ 't :=
+                }
+              if$
+            }
+            while$
+
+            % check for valid DOI (should be 10.xxxx/yyyy)
+            s #1 #3 substring$ "10." =
+              { }
+              { "unrecognized DOI substring " s * " in DOI value [" * doi * "]" * warning$ }
+            if$
+
+            s   % push the stripped DOI on the output stack
+
+        }
+        {
+          "unrecognized DOI value [" doi * "]" * warning$
+          doi   % push the unrecognized original DOI on the output stack
+        }
+      if$
+    }
+  if$
+}
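+
+% Sketch of the three branches above (values are illustrative):
+%   doi = "10.1145/1534530.1534545"                    -> pushed unchanged
+%   doi = "http://doi.acm.org/10.1145/1534530.1534545" -> "10.1145/1534530.1534545"
+%   doi = "doi.acm.org/10.1145/1534530.1534545"        -> warning; pushed as-is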
+
+%
+% Change by BV: added standard prefix to URL
+%
+FUNCTION { output.doi } % UTAH
+{ % output non-empty DOI as one-line sentence (stack untouched)
+  doi empty.or.unknown
+    { }
+    {
+      %% NB: We want URLs at beginning of line to reduce likelihood of
+      %% BibTeX's nasty line wrapping after column 79, which then requires
+      %% manual (or automated) editing of the .bbl file to repair.
+      %% The \url{} macro strips percent-newlines, and is thus safe in
+      %% the presence of the line wrapping, but \path|...| and
+      %% \verb|...| do not.
+      "\showDOI{%" writeln
+      "\url{http://dx.doi.org/" strip.doi * "}}" * writeln
+    }
+  if$
+}
+
+FUNCTION { output.isbn }                % UTAH
+{ % output non-empty ISBN-10 and/or ISBN-13 as one-line sentences (stack untouched)
+  show-isbn-10-and-13
+    {
+      %% show both 10- and 13-digit ISBNs
+      isbn empty.or.unknown
+        { }
+        {
+          "\showISBNx{" isbn * "}" * writeln
+        }
+      if$
+      isbn-13 empty.or.unknown
+        { }
+        {
+          "\showISBNxiii{" isbn-13 * "}" * writeln
+        }
+      if$
+    }
+    {
+      %% show 10-digit ISBNs only if 13-digit ISBNs not available
+      isbn-13 empty.or.unknown
+        {
+          isbn empty.or.unknown
+            { }
+            {
+              "\showISBNx{" isbn * "}" * writeln
+            }
+          if$
+        }
+        {
+          "\showISBNxiii{" isbn-13 * "}" * writeln
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION { output.issn } % UTAH
+{ % output non-empty ISSN as one-line sentence (stack untouched)
+  issn empty.or.unknown
+    { }
+    { "\showISSN{" issn * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.issue }
+{ % output non-empty issue number as a one-line sentence (stack untouched)
+  issue empty.or.unknown
+    { }
+    { "Issue " issue * "." * writeln }
+  if$
+}
+
+FUNCTION { output.lccn } % UTAH
+{ % return with stack untouched
+  lccn empty.or.unknown
+    { }
+    { "\showLCCN{" lccn * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.note } % UTAH
+{ % return with stack empty
+  note empty.or.unknown
+    { }
+    { "\shownote{" note add.period$ * "}" * writeln }
+  if$
+}
+
+FUNCTION { output.note.check } % UTAH
+{ % return with stack empty
+  note empty.or.unknown
+    { "empty note in " cite$ * warning$ }
+    { "\shownote{" note add.period$ * "}" * writeln }
+  if$
+}
+
+%
+% Changes by BV 2011/04/15.  Do not output
+% url if doi is defined
+%
+FUNCTION { output.url } % UTAH
+{ % return with stack untouched
+  % output URL and associated lastaccessed fields
+  doi empty.or.unknown
+  {
+    url empty.or.unknown
+      { }
+      {
+          %% NB: We want URLs at beginning of line to reduce likelihood of
+          %% BibTeX's nasty line wrapping after column 79, which would require
+          %% manual (or automated) editing of the .bbl file to repair.  However,
+          %% the \url{} macro handles the unwrapping job automatically.
+          "\showURL{%" writeln
+          lastaccessed empty.or.unknown
+            { "" }
+            { "Retrieved " lastaccessed * " from " * }
+          if$
+
+          %% The URL field may contain a semicolon-separated list of Web
+          %% addresses, and we locate and wrap each of them in \url{...}.
+          %% The simplistic approach of putting the entire list into the
+          %% macro argument is that the semicolons are typeset in a
+          %% typewriter font, and no space follows them.
+          %%
+          %% We therefore replace the original code
+          %%    "\url{" * url * "}}" * writeln
+          %% with this character-at-a-time loop:
+
+          "\url{" *
+
+          url 't :=                       % get modifiable copy of URL list
+
+          { t text.length$ }
+          {
+            t #1 #1 substring$ ";" =
+              {                         % then split argument at separator
+                "};" * writeln
+                "\url{"
+              }
+              {                         % else concatenate nonblank character to argument
+                t #1 #1 substring$ " " =
+                  { }
+                  { t #1 #1 substring$ * }
+                if$
+              }
+            if$
+
+            t #2 t text.length$ #1 - substring$ 't :=
+          }
+          while$
+
+          "}}" * writeln
+      }
+    if$
+  }
+  { }
+  if$
+}
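+
+% Worked example of the character loop above (hypothetical field value):
+%     url = "http://a.org/x; http://b.org/y"
+% splits at the semicolon, skips the blank, and writes
+%     \showURL{%
+%     \url{http://a.org/x};
+%     \url{http://b.org/y}}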
+
+FUNCTION { output.year.check }
+{ % warn if year empty, else output top string and leave " YEAR<label>" on stack in mid-sentence
+  year empty.or.unknown
+     { "empty year in " cite$ * warning$ }
+     { write$
+        " " year * extra.label *
+       mid.sentence 'output.state :=
+     }
+  if$
+}
+
+FUNCTION { not }
+{
+    { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION { and }
+{
+    'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION { or }
+{
+   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION { le }
+{
+  %% test whether first number is less than or equal to second number
+  %% stack in:  n1 n2
+  %% stack out: if n1 <= n2 then 1 else 0
+
+  %% "DEBUG: le " cite$ * warning$
+  > { #0 } { #1 } if$
+}
+
+FUNCTION { ge }
+{
+  %% test whether first number is greater than or equal to second number
+  %% stack in:  n1 n2
+  %% stack out: if n1 >= n2 then 1 else 0
+
+  %% "DEBUG: ge " cite$ * warning$
+  < { #0 } { #1 } if$
+}
+
+FUNCTION { is.leading.digit }
+{
+  %% test whether first character of string is a digit
+  %% stack in:  string
+  %% stack out: if first-char-is-digit then 1 else 0
+
+  #1 #1 substring$                      % replace string by string[1:1]
+  duplicate$                            % string[1:1] string[1:1]
+  chr.to.int$
+  "0" chr.to.int$ swap$ le              % "0" <= string[1:1] --> 0-or-1
+  swap$                                 % 0-or-1 string[1:1]
+  chr.to.int$
+  "9" chr.to.int$ le                    % string[1:1} <= "9" --> 0-or-1
+  and
+}
+
+FUNCTION { skip.digits }
+{
+  %% skip over leading digits in string
+  %% stack in:  string
+  %% stack out: rest-of-string leading-digits
+
+  %% "DEBUG: enter skip.digits " cite$ * warning$
+
+  %% dump.stack.1
+
+  duplicate$
+  't :=
+  't.org :=
+  "" 'u :=
+
+  { t text.length$ }
+  {
+    %% "=================DEBUG: skip.digits   t = [" t * "]" * warning$
+    t is.leading.digit
+      { t #2 t text.length$ #1 - substring$ }
+      {
+        t 'u :=
+        ""
+      }
+    if$
+    't :=
+  }
+  while$
+
+  u                                                             % rest of string
+  t.org #1 t.org text.length$ u text.length$ - substring$       % leading digits
+
+  %% "DEBUG: t.org = [" t.org * "]" * warning$
+  %% "DEBUG: u     = [" u * "]" * warning$
+
+  %% dump.stack.2
+
+  %% "DEBUG: leave skip.digits " cite$ * warning$
+}
+
+FUNCTION { skip.nondigits }
+{
+  %% skip over leading nondigits in string
+  %% stack in:  string
+  %% stack out: rest-of-string
+
+  %% "DEBUG: enter skip.nondigits " cite$ * warning$
+
+  't :=
+  "" 'u :=
+
+  { t text.length$ }
+  {
+    %% "=================DEBUG: skip.nondigits   t = [" t * "]" * warning$
+    t is.leading.digit
+      {
+        t 'u :=
+        ""
+      }
+      { t #2 t text.length$ #1 - substring$ }
+    if$
+    't :=
+  }
+  while$
+
+  u                     % rest of string
+
+  %% dump.stack.1
+  %% "DEBUG: leave skip.nondigits " cite$ * warning$
+}
+
+FUNCTION { parse.next.number }
+{
+  %% stack in:  string
+  %% stack out: rest-of-string next-numeric-part-of-string
+  %% Example:
+  %% stack in:  "123:1--123:59"
+  %% stack out: ":1--123:59" "123"
+
+  's :=
+  s skip.nondigits 's :=
+  s skip.digits
+}
+
+FUNCTION { reduce.pages.to.page.count }
+{
+  %% Stack in:  arbitrary-and-unused
+  %% Stack out: unchanged
+  %%
+  %% For the new-style pagination with article number and numpages or
+  %% pages, we expect to have BibTeX entries containing something like
+  %%     articleno = "17",
+  %%     pages     = "1--23",
+  %% with output "Article 17, 23 pages",
+  %% or
+  %%     articleno = "17",
+  %%     numpages  = "23",
+  %% with output "Article 17, 23 pages",
+  %% or
+  %%     articleno = "17",
+  %%     pages     = "17:1--17:23",
+  %% with output "Article 17, 23 pages",
+  %%
+  %% If articleno is missing or empty, then we should output "1--23",
+  %% "23" (with a warning of a missing articleno), or "17:1--17:23",
+  %% respectively.
+
+  %% "DEBUG: enter reduce.pages.to.page.count " cite$ * warning$
+
+  %% "DEBUG: pages = [" pages * "]" * warning$
+
+  pages
+  parse.next.number 'p1 :=
+  parse.next.number 'p2 :=
+  parse.next.number 'p3 :=
+  parse.next.number 'page.count :=
+
+  duplicate$
+  empty.or.unknown
+    {  }
+    {
+      duplicate$ "unexpected trailing garbage [" swap$ *
+      "] after n:p1--n:p2 in pages = [" *
+      pages *
+      "] in " *
+      cite$ *
+      warning$
+    }
+  if$
+
+  pop$
+
+  %% "DEBUG: reduce.pages.to.page.count: "
+  %% " p1 = " p1 * *
+  %% " p2 = " p2 * *
+  %% " p3 = " p3 * *
+  %% " p4 = " page.count * *
+  %% " in " cite$ * * warning$
+
+  p1 p3 =   p2 "1" =   and   numpages empty.or.unknown   and
+    { "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$ }
+    {
+      numpages empty.or.unknown
+        { pages }
+        { numpages }
+      if$
+      'page.count :=
+    }
+  if$
+
+  p1 "1" =   p3 empty.or.unknown   and   numpages empty.or.unknown   and
+    {
+      p2 'page.count :=
+      "INFO: reduced pages = [" pages * "] to numpages = [" * page.count * "]" * warning$
+    }
+    {
+      numpages empty.or.unknown
+        { pages }
+        { numpages }
+      if$
+      'page.count :=
+    }
+  if$
+
+  %% "DEBUG: leave reduce.pages.to.page.count " cite$ * warning$
+}
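+
+% Illustrative trace (values hypothetical): pages = "17:1--17:23" parses as
+% p1 = "17", p2 = "1", p3 = "17", page.count = "23"; since p1 = p3, p2 = "1",
+% and numpages is absent, the range is reduced to a 23-page count, matching
+% the "Article 17, 23 pages" style described above.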
+
+FUNCTION { new.block.checkb }
+{ % issue a new.block only if at least one of top two stack strings is not empty
+  empty.or.unknown
+  swap$ empty.or.unknown
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION { field.or.null }
+{ % convert empty value to null string, else return value
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION { emphasize }
+{ % emphasize a non-empty top string on the stack (WITHOUT italic correction)
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+FUNCTION { emphasize.with.italic.correction }
+{ % convert empty string to null string, or emphasize with a trailing italic correction
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    { "{\em " swap$ * "\/}" * }
+  if$
+}
+
+FUNCTION { comma }
+{ % convert empty string to null string, or brace string and add trailing comma
+  duplicate$ empty.or.unknown
+    { pop$ "" }
+    { "{" swap$ * "}," * }
+  if$
+}
+
+FUNCTION { format.names }
+{
+  % Format bibliographical entries. In this fork every author, including
+  % the first, is rendered given name first ("{ff }{vv }{ll}{, jj}");
+  % the rejected last-name-first patterns are kept commented out below.
+  % All names are formatted in this routine.
+
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.name$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 =
+        %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+        %NO: BAD ORDER: {"{" s nameptr "{ff~}{ll}{, jj}{, vv}" format.name$ * "}" * 't := }
+        {"{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+        {"{" s nameptr "{ff }{vv }{ll}{, jj}" format.name$ * "}" * 't := }
+      if$
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "{\sc others}" =
+                { " {et~al\mbox{.}}" * } % jrh: avoid spacing problems
+                { " {and} " * t * } % from Chicago Manual of Style
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+}
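+
+% Illustrative output (hypothetical field): author = "Aaron Shaw and
+% Benjamin Mako Hill" formats as "{Aaron Shaw} {and} {Benjamin Mako Hill}";
+% with three or more names the routine comma-separates them and keeps a
+% serial comma before the final "{and} ".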
+
+FUNCTION { my.full.label }
+{
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.name$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+
+    { s nameptr "{vv~}{ll}" format.name$ 't :=  % get the next name
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                { " et~al\mbox{.}" * } % jrh: avoid spacing problems
+                { " and " * t * } % from Chicago Manual of Style
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+
+}
+
+FUNCTION { format.names.fml }
+{
+  % Format names in "familiar" format, with first initial followed by
+  % last name. Like format.names, ALL names are formatted.
+  % jtb: The names are NOT put in small caps
+
+  's :=
+  #1 'nameptr :=               % nameptr = 1;
+  s num.names$ 'numnames :=    % numnames = num.name$(s);
+  numnames 'namesleft :=
+    { namesleft #0 > }
+
+    {
+      "{" s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ * "}" * 't :=
+
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "{others}" =
+                { " {et~al\mbox{.}}" * }
+                { " {and} " * t * }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=          % nameptr += 1;
+      namesleft #1 - 'namesleft :=      % namesleft -= 1;
+    }
+  while$
+}
+
+FUNCTION { format.authors }
+{
+  author empty.or.unknown
+    { "" }
+    { author format.names add.period$} % jtb: add period if none before
+  if$
+}
+
+FUNCTION { format.key }
+{
+  empty.or.unknown
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION { format.no.key }
+{
+  empty.or.unknown
+    { "" }
+    { "" }
+  if$
+}
+
+FUNCTION { format.editors.fml }
+{
+  % Format editor names for use in the "in" types: inbook, incollection,
+  % inproceedings: first initial, then last names. When editors are the
+  % LABEL for an entry, then format.editor is used which lists editors
+  % by last name first.
+
+  editor empty.or.unknown
+    { "" }
+    {
+      editor format.names.fml editor num.names$ #1 >
+        { " (Eds.)" * }
+        { " (Ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.editors }
+{ % format editor names for use in labels, last names first.
+  editor empty.or.unknown
+    { "" }
+    {
+      editor format.names
+      editor num.names$ #1 >
+        { " (Eds.)." * }
+        { " (Ed.)." * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.articletitle }
+{
+  title empty.or.unknown
+    { "" }
+    % Use this to preserve lettercase in titles:
+    { "\showarticletitle{" title * "}" * }
+    % Use this for downcase title style:
+    % { \showarticletitle{" title "t" change.case$ * "}" * }
+  if$
+}
+
+FUNCTION { format.title }
+{
+  title empty.or.unknown
+    { "" }
+    % Use this to preserve lettercase in titles:
+    { title }
+    % Use this for downcase title style:
+    % { title "t" change.case$ }
+  if$
+}
+
+FUNCTION { n.dashify }
+{
+  't :=
+  ""
+    { t empty.or.unknown not }
+    {
+      t #1 #1 substring$ "-" =
+        {
+          t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {
+              { t #1 #1 substring$ "-" = }
+              {
+                "-" *
+                t #2 global.max$ substring$ 't :=
+              }
+              while$
+            }
+          if$
+        }
+        {
+          t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
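+
+% Examples (illustrative): "117-118" becomes "117--118", while an already
+% dashed "117--118" and a non-range "117" pass through unchanged.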
+
+FUNCTION { format.btitle }
+{
+  edition empty.or.unknown
+  { title emphasize }
+  { title empty.or.unknown
+    { title emphasize } % jtb: what is this supposed to do ?!?
+    { "{\em " title * "\/} (" * edition "l" change.case$ * " ed.)" * } % jtb: no parens for ed.
+    if$
+  }
+  if$
+}
+
+FUNCTION { format.emphasize.booktitle }
+{ % push "" or "{\em booktitle}" or "{\em booktitle}, (second ed.)" on stack
+  edition empty.or.unknown
+    { booktitle emphasize }
+    { booktitle empty.or.unknown
+      { "" }
+      { "{\em " booktitle * "} (" * edition "l" change.case$ * " ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.city }
+{
+  % jtb: if the preceding string (the title of the conference) is non-empty,
+  % jtb: append the location, otherwise leave empty (so as to trigger the
+  % jtb: error message in output.check
+
+  duplicate$ empty.or.unknown
+    { }
+    {
+      city empty.or.unknown
+        {
+          date empty.or.unknown
+            { }
+            { " (" * date * ")" * }
+          if$
+        }
+        {
+          date empty.or.unknown
+            { " (" * city * ")" * }
+            { " (" * city * ", " * date * ")" * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION { tie.or.space.connect }
+{
+  duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
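+
+% Examples (illustrative): "Vol." "3" tie.or.space.connect yields "Vol.~3"
+% (a tie, because "3" is shorter than three characters), while
+% "Vol." "123" yields "Vol. 123" with an ordinary space.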
+
+FUNCTION { either.or.check }
+{
+  empty.or.unknown
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION { format.bvolume }
+{
+  % jtb: If there is a series, this is added and the volume trails after it.
+  % jtb: Otherwise, "Vol" is Capitalized.
+
+  volume empty.or.unknown
+    { "" }
+    {
+      series empty.or.unknown
+        { "Vol." volume tie.or.space.connect}
+        { series ", " * "Vol." volume tie.or.space.connect *}
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION { format.bvolume.noseries }
+{
+  volume empty.or.unknown
+    { "" }
+    {
+      series empty.or.unknown
+        { "Vol." volume tie.or.space.connect}
+        { "Vol." volume tie.or.space.connect}
+%        { series ", " * "Vol." volume tie.or.space.connect *}
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION { format.series }
+{
+  series empty.or.unknown
+    {""}
+    {" {\em (" * series ")}" *}
+  if$
+}
+
+FUNCTION { format.number.series }
+{
+  volume empty.or.unknown
+    {
+      number empty.or.unknown
+        {
+          volume empty.or.unknown
+          { "" }
+          {
+            series empty.or.unknown
+              { "" }
+              { " (" series * ")" * }
+            if$
+          }
+          if$
+        }                                       %    { series field.or.null }
+        {
+          output.state mid.sentence =
+            { "Number" }                        % gnp - changed to mixed case always
+            { "Number" }
+          if$
+          number tie.or.space.connect series empty.or.unknown
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in " * series * }
+          if$
+        }
+      if$
+    }
+    {
+      ""
+    }
+  if$
+}
+
+FUNCTION { multi.page.check }
+{
+  't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty.or.unknown not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+    { #1 'multiresult := }
+    { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION { format.pages }
+{
+  pages empty.or.unknown
+    { "" }
+    {
+      pages multi.page.check
+        { pages n.dashify } % gnp - removed () % jtb: removed pp.
+        { pages }
+      if$
+    }
+  if$
+}
+
+FUNCTION { format.pages.check.without.articleno }
+{ %% format pages field only if articleno is absent
+  %% Stack out: pages-specification
+  numpages missing$ pages missing$ and
+    { "page numbers missing in both pages and numpages fields in " cite$ * warning$ }
+    { }
+  if$
+
+  articleno empty.or.unknown
+    {
+      pages missing$
+        { numpages }
+        { format.pages }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION { format.pages.check }
+{
+  pages empty.or.unknown
+    { "page numbers missing in " cite$ * warning$ "" }
+    { pages n.dashify }
+  if$
+}
+
+FUNCTION { format.bookpages }
+{
+  bookpages empty.or.unknown
+    { "" }
+    { bookpages "book pages" tie.or.space.connect }
+  if$
+}
+
+FUNCTION { format.named.pages }
+{
+  pages empty.or.unknown
+    { "" }
+    { format.pages "pages" tie.or.space.connect }
+  if$
+}
+
+%
+% Changed by Boris Veytsman, 2011-03-13
+% Now the word "pages" is printed even if
+% the pages field is not empty.
+%
+
+FUNCTION { format.page.count }
+{
+  page.count empty.or.unknown
+    { "" }
+    {
+      articleno empty.or.unknown
+        { "numpages field, but no articleno field, in " cite$ * warning$ }
+        { }
+      if$
+      page.count "pages" tie.or.space.connect
+    }
+  if$
+}
+
+FUNCTION { format.articleno.numpages }
+{
+  %% There are seven possible outputs, depending on which fields are set.
+  %%
+  %% These four are handled here:
+  %%
+  %%     articleno, numpages, pages     -> "Article articleno-value, numpages-value pages"
+  %%     articleno, numpages            -> "Article articleno-value, numpages-value pages"
+  %%     articleno, pages               -> "Article articleno-value, reduced-pages-value pages"
+  %%     articleno                      -> "Article articleno-value" and warn about missing numpages
+  %%
+  %% The remaining three have already been handled by
+  %% format.pages.check.without.articleno:
+  %%
+  %%     numpages, pages                -> "pages-value"
+  %%     numpages                       -> "numpages-value"
+  %%     pages                          -> "pages-value"
+
+  articleno empty.or.unknown
+    {
+      numpages empty.or.unknown
+        { }
+        { "require articleno with numpages field in " cite$ * warning$ }
+      if$
+      ""
+    }
+    {
+      numpages empty.or.unknown
+        {
+          pages empty.or.unknown
+            {
+              "require pages or numpages fields with articleno field in " cite$ * warning$
+              "" 'page.count :=
+            }
+            { reduce.pages.to.page.count }
+          if$
+        }
+        { numpages 'page.count := }
+      if$
+
+      %% The Article number is now handled in format.day.month.year because
+      %% ACM prefers the style "Digital Libraries 12, 3, Article 5 (July 2008)"
+      %% over "Digital Libraries 12, 3 (July 2008), Article 5"
+      %% format.articleno output
+      format.page.count
+    }
+  if$
+}
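+
+% Illustrative input/output (hypothetical entry): with
+%     articleno = "17",  numpages = "23",
+% the function sets page.count to "23" and format.page.count renders
+% "23 pages"; the "Article 17" part is emitted separately by
+% format.day.month.year, as noted above.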
+
+FUNCTION { format.journal.volume.number.day.month.year }
+{
+  % By Young (and Spencer)
+  % GNP - fixed bugs with missing volume, number, and/or pages
+  %
+  % Format journal, volume, number, pages for article types.
+  %
+  journal empty.or.unknown
+    { "no journal in " cite$ * warning$
+      "" }
+%    { journal emphasize.with.italic.correction }
+     {
+           journal "Journal of the ACM" =
+       { "{\it J. ACM}" }
+       {
+           journal "American Mathematical Society Translations" =
+       { "{\it Amer. Math. Soc. Transl.}" }
+       {
+           journal "Bulletin of the American Mathematical Society" =
+       { "{\it Bull. Amer. Math. Soc.}" }
+       {
+           journal "Proceedings of the American Mathematical Society" =
+       { "{\it Proc. Amer. Math. Soc.}" }
+       {
+           journal "Transactions of the American Mathematical Society" =
+       { "{\it Trans. Amer. Math. Soc.}" }
+       {
+           journal "Communications of the {ACM}" =
+       { "{\it Commun. {ACM}}" }
+       {
+           journal "{ACM} Computing Surveys" =
+       { "{\it Comput. Surveys}" }
+       {
+           journal "{ACM} Transactions on Mathematical Software" =
+       { "{\it {ACM} Trans. Math. Software}" }
+       {
+           journal "{ACM} {SIGNUM} Newsletter" =
+       { "{\it {ACM} {SIGNUM} Newslett.}" }
+       {
+           journal "American Journal of Sociology" =
+       { "{\it Amer. J. Sociology}" }
+       {
+           journal "Journal of the American Statistical Association" =
+       { "{\it J. Amer. Statist. Assoc.}" }
+       {
+           journal "Applied Mathematics and Computation" =
+       { "{\it Appl. Math. Comput.}" }
+       {
+           journal "American Mathematical Monthly" =
+       { "{\it Amer. Math. Monthly}" }
+       {
+           journal "British Journal of Mathematical and Statistical Psychology" =
+       { "{\it Brit. J. Math. Statist. Psych.}" }
+       {
+           journal "Canadian Mathematical Bulletin" =
+       { "{\it Canad. Math. Bull.}" }
+       {
+           journal "Journal of Computational and Applied Mathematics" =
+       { "{\it J. Comput. Appl. Math.}" }
+       {
+           journal "Journal of Computational Physics" =
+       { "{\it J. Comput. Phys.}" }
+       {
+           journal "Computers and Structures" =
+       { "{\it Comput. \& Structures}" }
+       {
+           journal "The Computer Journal" =
+       { "{\it Comput. J.}" }
+       {
+           journal "Journal of Computer and System Sciences" =
+       { "{\it J. Comput. System Sci.}" }
+       {
+           journal "Contemporary Mathematics" =
+       { "{\it Contemp. Math.}" }
+       {
+           journal "Crelle's Journal" =
+       { "{\it Crelle's J.}" }
+       {
+           journal "Giornale di Mathematiche" =
+       { "{\it Giorn. Mat.}" }
+       {
+           journal "{IEEE} Transactions on Computers" =
+       { "{\it {IEEE} Trans. Comput.}" }
+       {
+           journal "{IEEE} Transactions on Automatic Control" =
+       { "{\it {IEEE} Trans. Automat. Control}" }
+       {
+           journal "Proceedings of the {IEEE}" =
+       { "{\it Proc. {IEEE}}" }
+       {
+           journal "{IEEE} Transactions on Aerospace and Electronic Systems" =
+       { "{\it {IEEE} Trans. Aerospace Electron. Systems}" }
+       {
+           journal "{IMA} Journal of Numerical Analysis" =
+       { "{\it {IMA} J. Numer. Anal.}" }
+       {
+           journal "Information Processing Letters" =
+       { "{\it Inform. Process. Lett.}" }
+       {
+           journal "Journal of the Institute of Mathematics and its Applications" =
+       { "{\it J. Inst. Math. Appl.}" }
+       {
+           journal "International Journal of Control" =
+       { "{\it Internat. J. Control}" }
+       {
+           journal "International Journal for Numerical Methods in Engineering" =
+       { "{\it Internat. J. Numer. Methods Engrg.}" }
+       {
+           journal "International Journal of Supercomputing Applications" =
+       { "{\it Internat. J. Supercomputing Applic.}" }
+       {
+           journal "Journal of Research of the National Bureau of Standards" =
+       { "{\it J. Res. Nat. Bur. Standards}" }
+       {
+           journal "Linear Algebra and its Applications" =
+       { "{\it Linear Algebra Appl.}" }
+       {
+           journal "Journal of Mathematical Analysis and Applications" =
+       { "{\it J. Math. Anal. Appl.}" }
+       {
+           journal "Mathematische Annalen" =
+       { "{\it Math. Ann.}" }
+       {
+           journal "Journal of Mathematical Physics" =
+       { "{\it J. Math. Phys.}" }
+       {
+           journal "Mathematics of Computation" =
+       { "{\it Math. Comp.}" }
+       {
+           journal "Mathematica Scandinavica" =
+       { "{\it Math. Scand.}" }
+       {
+           journal "Mathematical Tables and Other Aids to Computation" =
+       { "{\it Math. Tables Aids Comput.}" }
+       {
+           journal "Numerische Mathematik" =
+       { "{\it Numer. Math.}" }
+       {
+           journal "Pacific Journal of Mathematics" =
+       { "{\it Pacific J. Math.}" }
+       {
+           journal "Journal of Parallel and Distributed Computing" =
+       { "{\it J. Parallel and Distrib. Comput.}" }
+       {
+           journal "Parallel Computing" =
+       { "{\it Parallel Comput.}" }
+       {
+           journal "Philosophical Magazine" =
+       { "{\it Philos. Mag.}" }
+       {
+           journal "Proceedings of the National Academy of Sciences of the USA" =
+       { "{\it Proc. Nat. Acad. Sci. U. S. A.}" }
+       {
+           journal "Quarterly Journal of Mathematics, Oxford, Series (2)" =
+       { "{\it Quart. J. Math. Oxford Ser. (2)}" }
+       {
+           journal "Quarterly of Applied Mathematics" =
+       { "{\it Quart. Appl. Math.}" }
+       {
+           journal "Review of the International Statisical Institute" =
+       { "{\it Rev. Inst. Internat. Statist.}" }
+       {
+           journal "Journal of the Society for Industrial and Applied Mathematics" =
+       { "{\it J. Soc. Indust. Appl. Math.}" }
+       {
+           journal "Journal of the Society for Industrial and Applied Mathematics, Series B, Numerical Analysis" =
+       { "{\it J. Soc. Indust. Appl. Math. Ser. B Numer. Anal.}" }
+       {
+           journal "{SIAM} Journal on Algebraic and Discrete Methods" =
+       { "{\it {SIAM} J. Algebraic Discrete Methods}" }
+       {
+           journal "{SIAM} Journal on Applied Mathematics" =
+       { "{\it {SIAM} J. Appl. Math.}" }
+       {
+           journal "{SIAM} Journal on Computing" =
+       { "{\it {SIAM} J. Comput.}" }
+       {
+           journal "{SIAM} Journal on Matrix Analysis and Applications" =
+       { "{\it {SIAM} J. Matrix Anal. Appl.}" }
+       {
+           journal "{SIAM} Journal on Numerical Analysis" =
+       { "{\it {SIAM} J. Numer. Anal.}" }
+       {
+           journal "{SIAM} Review" =
+       { "{\it {SIAM} Rev.}" }
+       {
+           journal "{SIAM} Journal on Scientific and Statistical Computing" =
+       { "{\it {SIAM} J. Sci. Statist. Comput.}" }
+       {
+           journal "Software Practice and Experience" =
+       { "{\it Software Prac. Experience}" }
+       {
+           journal "Statistical Science" =
+       { "{\it Statist. Sci.}" }
+       {
+           journal "{USSR} Computational Mathematics and Mathematical Physics" =
+       { "{\it {U. S. S. R.} Comput. Math. and Math. Phys.}" }
+       {
+           journal "Journal of {VLSI} and Computer Systems" =
+       { "{\it J. {VLSI} Comput. Syst.}" }
+       {
+           journal "Zeitschrift fur Angewandte Mathematik und Mechanik" =
+       { "{\it Z. Angew. Math. Mech.}" }
+       {
+           journal "Zeitschrift fur Angewandte Mathematik und Physik" =
+       { "{\it Z. Angew. Math. Phys.}" }
+       {
+           journal "ACM Computing Surveys" =
+       { "{\it Comput. Surveys}" }
+       {
+           journal "ACM Transactions on Mathematical Software" =
+       { "{\it ACM Trans. Math. Software}" }
+       {
+           journal "ACM {SIGNUM} Newsletter" =
+       { "{\it ACM {SIGNUM} Newslett.}" }
+       {
+           journal "IEEE Transactions on Computers" =
+       { "{\it IEEE Trans. Comput.}" }
+       {
+           journal "IEEE Transactions on Automatic Control" =
+       { "{\it IEEE Trans. Automat. Control}" }
+       {
+           journal "Proceedings of the IEEE" =
+       { "{\it Proc. IEEE}" }
+       {
+           journal "IEEE Transactions on Aerospace and Electronic Systems" =
+       { "{\it IEEE Trans. Aerospace Electron. Systems}" }
+       {
+           journal "IMA Journal of Numerical Analysis" =
+       { "{\it IMA J. Numer. Anal.}" }
+       {
+           journal "SIAM Journal on Algebraic and Discrete Methods" =
+       { "{\it SIAM J. Algebraic Discrete Methods}" }
+       {
+           journal "SIAM Journal on Applied Mathematics" =
+       { "{\it SIAM J. Appl. Math.}" }
+       {
+           journal "SIAM Journal on Computing" =
+       { "{\it SIAM J. Comput.}" }
+       {
+           journal "SIAM Journal on Matrix Analysis and Applications" =
+       { "{\it SIAM J. Matrix Anal. Appl.}" }
+       {
+           journal "SIAM Journal on Numerical Analysis" =
+       { "{\it SIAM J. Numer. Anal.}" }
+       {
+           journal "SIAM Review" =
+       { "{\it SIAM Rev.}" }
+       {
+           journal "SIAM Journal on Scientific and Statistical Computing" =
+       { "{\it SIAM J. Sci. Statist. Comput.}" }
+       {
+           journal "USSR Computational Mathematics and Mathematical Physics" =
+       { "{\it U. S. S. R. Comput. Math. and Math. Phys.}" }
+       {
+           journal "Journal of VLSI and Computer Systems" =
+       { "{\it J. VLSI Comput. Syst.}" }
+       {
+           journal "Communications of the ACM" =
+       { "{\it Commun. ACM}" }
+       %% If no match with cases needing special handling, just output journal name
+       {  journal emphasize.with.italic.correction  }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+           }
+     if$
+         }
+    if$
+
+  number empty.or.unknown
+    {
+      volume empty.or.unknown
+        { "no number and no volume in " cite$ * warning$ "" * }
+        { " " * " {" * volume * "}" * }
+      if$
+    }
+    {
+      volume empty.or.unknown
+        {
+          "unusual to have number, but no volume, for " cite$ * warning$
+          " " * number *
+        }
+        { " " * volume comma " " * number * * }
+      if$
+    }
+  if$
+
+  format.day.month.year *
+}
+
+FUNCTION { format.chapter.pages }
+{
+  chapter empty.or.unknown
+    'format.pages
+    { type empty.or.unknown
+        { "Chapter" } % gnp - changed to mixed case
+        { type "t" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty.or.unknown
+        {"page numbers missing in " cite$ * warning$} % gnp - added check
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
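+% Illustrative (hypothetical field values; the exact page text comes from
+% format.pages): chapter = "3" with an empty type and pages = "45--67"
+% renders roughly as "Chapter 3, 45--67".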
+
+FUNCTION { format.in.emphasize.booktitle }
+{ % jtb: format for collections or proceedings not appearing in a journal
+  booktitle empty.or.unknown
+  { "" }
+  { "In " format.emphasize.booktitle * }
+  if$
+}
+
+FUNCTION { format.in.booktitle }
+{ % jtb: format for proceedings appearing in a journal
+  booktitle empty.or.unknown
+  { "" }
+  { "In " booktitle * }
+  if$
+}
+
+FUNCTION { format.in.ed.booktitle }
+{
+  booktitle empty.or.unknown
+  { "" }
+  { editor empty.or.unknown
+    { "In " format.emphasize.booktitle * }
+                % jtb: swapped editor location
+    { "In " format.emphasize.booktitle * ", " * format.editors.fml * }
+    if$
+  }
+  if$
+}
+
+FUNCTION { format.thesis.type }
+{ % call with default type on stack top
+  type empty.or.unknown
+    'skip$    % use default type
+    {
+      pop$    % discard default type
+      % NO: it is silly to have to brace protect every degree type!:  type "t" change.case$
+      type
+    }
+  if$
+}
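+% Usage (as in the mastersthesis and phdthesis drivers below):
+%   "Master's\ thesis" format.thesis.type
+% pushes the default string unless the entry's own type field overrides it.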
+
+FUNCTION { format.tr.number }
+{
+  type empty.or.unknown
+%    { "Tech. Rep." }     
+    { "{T}echnical {R}eport" }     % ACM wants it explicit (Gerry 9/28)
+    'type
+  if$
+  number empty.or.unknown
+    { "t" change.case$ }
+    %% LOOKS BAD: { "." * number tie.or.space.connect }
+    %% Prefer "Research report RJ687." to "Research report. RJ687."
+    { number tie.or.space.connect }
+  if$
+}
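+% e.g. type = "Research report" and number = "RJ687" (the example above)
+% yield "Research report RJ687", connected by a tie or space.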
+
+FUNCTION { format.advisor }
+{
+  advisor empty.or.unknown
+    { "" }
+    { "Advisor(s) " advisor * }
+  if$
+}
+
+FUNCTION { format.article.crossref }
+{ "See"
+  "\citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.crossref.editor }
+{
+  editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al\mbox{.}" * }         % jrh: avoid spacing problems
+    { #2 <
+    'skip$
+    { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+        { " et~al\mbox{.}" * }          % jrh: avoid spacing problems
+        { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+      if$
+    }
+      if$
+    }
+  if$
+}
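+% Yields "Smith" for one editor, "Smith and Jones" for two, and
+% "Smith et~al." for three or more or when "others" is listed
+% (illustrative surnames).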
+
+FUNCTION { format.book.crossref }
+{
+  volume empty.or.unknown
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect % gnp - changed to mixed case
+      " of " *
+    }
+  if$
+  editor empty.or.unknown
+  editor field.or.null author field.or.null =
+  or
+    { key empty.or.unknown
+    { series empty.or.unknown
+        { "need editor, key, or series for " cite$ * " to crossref " *
+          crossref * warning$
+          "" *
+        }
+        { "{\em " * series * "\/}" * }
+      if$
+    }
+    { key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.incoll.inproc.crossref }
+{ "See"
+  " \citeN{" * crossref * "}" *
+}
+
+FUNCTION { format.lab.names }
+{
+  % format.lab.names:
+  %
+  % determines "short" names for the abbreviated author information.
+  % "Long" labels are created in calc.label, using the routine my.full.label
+  % to format author and editor fields.
+  %
+  % There are 4 cases for labels.   (n=3 in the example)
+  % a) one author             Foo
+  % b) one to n               Foo, Bar and Baz
+  % c) use of "and others"    Foo, Bar et al.
+  % d) more than n            Foo et al.
+
+  's :=
+  s num.names$ 'numnames :=
+  numnames #2 >    % change this constant to set how many names are
+                   % allowed before "et al" is forced.
+    { s #1 "{vv~}{ll}" format.name$ " et~al\mbox{.}" * } % jrh: \mbox{} added
+    {
+      numnames #1 - 'namesleft :=
+      #2 'nameptr :=
+      s #1 "{vv~}{ll}" format.name$
+        { namesleft #0 > }
+        { nameptr numnames =
+            { s nameptr "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+                { " et~al\mbox{.}" * }          % jrh: avoid spacing problems
+                { " and " * s nameptr "{vv~}{ll}" format.name$ * }
+              if$
+            }
+            { ", " * s nameptr "{vv~}{ll}" format.name$ * }
+          if$
+          nameptr #1 + 'nameptr :=
+          namesleft #1 - 'namesleft :=
+        }
+      while$
+    }
+  if$
+}
+
+FUNCTION { author.key.label }
+{
+  author empty.or.unknown
+    { key empty.or.unknown
+          { "no key, author in " cite$ * warning$
+            cite$ #1 #3 substring$ }
+         'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION { author.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if author is null.
+  author empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "no key, author or organization in " cite$ * warning$
+              cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { organization }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION { editor.key.organization.label }
+{ % added - gnp. Provide label formatting by organization if editor is null.
+  editor empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "no key, editor or organization in " cite$ * warning$
+              cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { organization }
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION { author.editor.key.label }
+{
+  author empty.or.unknown
+    { editor empty.or.unknown
+          { key empty.or.unknown
+               { "no key, author, or editor in " cite$ * warning$
+                 cite$ #1 #3 substring$ }
+             'key
+           if$
+         }
+          { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION { calc.label }
+{
+  % Changed - GNP. See also author.organization.sort, editor.organization.sort
+  % Form label for BibTeX entry. The classification of which fields are used
+  % for which type of entry (book, inbook, etc.) are taken from alpha.bst.
+  % The change here from newapa is to also include organization as a
+  % citation label if author or editor is missing.
+
+  type$ "book" =
+  type$ "inbook" =
+  or
+  type$ "periodical" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+
+  author empty.or.unknown  % generate the full label citation information.
+    {
+      editor empty.or.unknown
+        {
+          organization empty.or.unknown
+            {
+              key empty.or.unknown
+                {
+                  "no author, editor, organization, or key in " cite$ * warning$
+                  "??"
+                }
+                { key }
+              if$
+            }
+            { organization }
+          if$
+        }
+        { editor my.full.label }
+      if$
+    }
+    { author my.full.label }
+  if$
+
+  % leave label on the stack, to be popped when required.
+
+  "}{" * swap$ * "}{" *
+  %  year field.or.null purify$ #-1 #4 substring$ *
+  %
+  % save the year for sort processing afterwards (adding a, b, c, etc.)
+  %
+  year field.or.null purify$ #-1 #4 substring$
+  'label.year :=
+}
+
+%
+% Change by Gerry: use number-like citations for transactions
+% 2011/03/23
+% Reverting: Ayman
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+% FUNCTION { output.bibitem }
+% {
+%   newline$
+%   "\bibitem[\protect\citeauthoryear{" write$
+%   calc.label write$
+%   sort.year write$
+%   "}]%" writeln
+%   "        {" write$
+%   cite$ write$
+%   "}" writeln
+%   ""
+%   before.all 'output.state :=
+% }
+
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url }
+{ % enter and return with stack empty
+  %% We switch now from buffered output to output of complete lines, so
+  %% that the Issue .. URL data have their own lines, and are less likely
+  %% to be line-wrapped by BibTeX's short-sighted algorithm, which wraps
+  %% lines longer than 79 characters, backtracking to what it thinks is
+  %% a break point in the string.  Any such wrapping MUST be undone to
+  %% prevent percent-newline from appearing in DOIs and URLs.  The
+  %% output data are intentionally wrapped in \showxxx{} macros at
+  %% beginning of line, and that supply their own punctuation (if they
+  %% are not defined to suppress output entirely), to make it easier for
+  %% other software to recover them from .bbl files.
+  %%
+  %% It also makes it possible to later change the macro definitions
+  %% to suppress particular output values, or alter their appearance.
+  %%
+  %% Note that it is possible for theses, technical reports, and
+  %% manuals to have ISBNs, and anything that has an ISBN may also
+  %% have an ISSN.  When there are no values for these keys, there
+  %% is no output generated for them here.
+
+  "\newblock" writeln
+  after.block 'output.state :=
+
+  output.issue
+  output.isbn
+  output.coden  % CODEN is functionally like ISSN, so output them sequentially
+  output.issn
+  output.lccn
+  output.doi    % DOI is ALWAYS last according to CrossRef DOI documentation
+  output.url    % but ACM wants URL last
+}
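+% Illustrative .bbl tail (hypothetical values): the lines emitted above
+% come out as, e.g.,
+%   \newblock
+%   \showISSN{1234-5678}
+%   \showDOI{...}
+% each on its own line so that BibTeX's wrapping cannot split them.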
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.note }
+{ % enter with stack empty, return with empty string on stack
+  output.issue.doi.coden.isxn.lccn.url
+  note empty.or.unknown
+    { }
+    {
+      "\newblock" writeln
+      output.note
+    }
+  if$
+  ""
+}
+
+FUNCTION { output.issue.doi.coden.isxn.lccn.url.note.check }
+{ % enter with stack empty, return with empty string on stack
+  output.issue.doi.coden.isxn.lccn.url
+  note empty.or.unknown
+    { }
+    {
+      "\newblock" writeln
+      output.note.check
+    }
+  if$
+  ""
+}
+
+FUNCTION { article }
+{
+  output.bibitem
+
+  author empty.or.unknown
+    {
+      editor empty.or.unknown
+        { "neither author and editor supplied for " cite$ * warning$ }
+        { format.editors "editor" output.check }
+      if$
+    }
+    { format.authors "author" output.check }
+  if$
+
+  author format.no.key output       % added
+  output.year.check                 % added
+  new.block
+  format.articletitle "title" output.check
+  new.block
+  howpublished output
+
+  crossref missing$
+    { format.journal.volume.number.day.month.year }
+    {
+      "cross reference in @Article{...} is unusual" warning$
+      format.article.crossref output.nonnull
+    }
+  if$
+  output
+
+  format.pages.check.without.articleno output
+  format.articleno.numpages output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { book }
+{
+  output.bibitem
+  author empty.or.unknown
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  output.year.check       % added
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { new.sentence              % jtb: start a new sentence for series/volume
+      format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address "address" output.check    % jtb: require address
+      fin.sentence
+      pages empty.or.unknown
+        { format.bookpages }    % use bookpages when pages empty
+        { format.pages.check "pages" tie.or.space.connect }
+      if$
+      output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { booklet }
+{
+  output.bibitem
+  format.authors output
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title "title" output.check
+  new.block
+  howpublished output
+  address output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { inbook }
+{
+  output.bibitem
+  author empty.or.unknown
+    { format.editors
+      "author and editor" output.check
+    }
+    { format.authors output.nonnull
+      crossref missing$
+    { "author and editor" editor either.or.check }
+    'skip$
+      if$
+    }
+  if$
+  output.year.check                 % added
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { new.sentence              % jtb: start a new sentence for series/volume
+      format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address "address" output.check    % jtb: require address
+      format.bookpages output
+      format.chapter.pages
+      "chapter and pages" output.check  % jtb: moved from before publisher
+    }
+    {
+      format.bookpages output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { incollection }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output       % added
+  output.year.check              % added
+  new.block
+  format.articletitle "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      new.sentence                % jtb: start a new sentence for series/volume
+      format.bvolume output
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address "address" output.check      % jtb: require address
+      format.bookpages output
+      format.chapter.pages output % gnp - was special.output.nonnull
+                                  % left out comma before page numbers
+                                  % jtb: moved from before publisher
+    }
+    {
+      format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { inproceedings }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output            % added
+  output.year.check                   % added
+  new.block
+  format.articletitle "title" output.check
+  howpublished output.dot.space
+  crossref missing$
+    {
+      journal missing$          % jtb: proceedings appearing in journals
+        { format.in.emphasize.booktitle format.city "booktitle"  output.check.dot.space
+          format.series output.removenospace
+          format.editors.fml output % BV 2011/09/27 Moved dot to comma
+          format.bvolume.noseries output
+          new.sentence
+          organization output
+          publisher "publisher" output.check % jtb: require publisher (?)
+          address "address" output.check  % jtb: require address
+          format.bookpages output
+        }
+        {
+           format.in.booktitle format.city "booktitle" output.check
+           format.editors.fml output
+           new.sentence
+           format.journal.volume.number.day.month.year output
+        }
+      if$
+      format.articleno output
+      format.pages.check.without.articleno output
+    }
+    {
+      format.incoll.inproc.crossref output.nonnull
+      format.articleno output
+      format.pages.check.without.articleno output
+    }
+  if$
+  format.articleno.numpages output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { conference } { inproceedings }
+
+FUNCTION { manual }
+{
+  output.bibitem
+  author empty.or.unknown
+    { editor empty.or.unknown
+      { organization "organization" output.check
+        organization format.key output }  % if all else fails, use key
+      { format.editors "author and editor" output.check }
+      if$
+    }
+    { format.authors output.nonnull }
+    if$
+  output.year.check                 % added
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  % jtb: back to normal style: organization, address
+  organization "organization" output.check
+  address output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { mastersthesis }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title emphasize "title" output.check  % NB: ACM style requires emphasized thesis title
+  new.block
+  "Master's\ thesis" format.thesis.type output new.sentence  % Added dot. BV 2011/09/27
+  school "school" output.check
+  address output
+  new.block
+  format.advisor output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { misc }
+{
+  output.bibitem
+  format.authors output
+  author format.key output            % added
+  output.year.check                   % added
+  title howpublished new.block.checkb
+  format.title output
+  new.block
+  howpublished output
+  "" output.nonnull.dot.space
+  output.day.month.year              % Gerry - appears odd if (only) the year is 'repeated' but (appears) 'valuable' if the month/day is _also_ included - 2011/09/28
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { phdthesis }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output          % added
+  output.year.check                 % added
+  new.block
+  format.title emphasize "title" output.check  % NB: ACM style requires emphasized thesis title
+  new.block
+ "Ph.D. Dissertation" format.thesis.type output new.sentence % Added dot. BV 2011/09/27
+  school "school" output.check
+  address output
+  new.block
+  format.advisor output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION {format.date}
+{ year empty.or.unknown
+    { month empty.or.unknown
+        {
+          ""                    % output empty date if year/month both empty
+          day empty.or.unknown
+            {  }
+            { "there's a day but no month or year in " cite$ * warning$ }
+          if$
+        }
+        { "there's a month but no year in " cite$ * warning$
+          month
+          day empty.or.unknown
+            { }
+            { " " * day * }
+          if$
+        }
+      if$
+    }
+    { month empty.or.unknown
+        {
+          year                  % output only year if month empty
+          day empty.or.unknown
+            {  }
+            { "there's a day and year but no month in " cite$ * warning$ }
+          if$
+        }
+        {
+          month " " *
+          day empty.or.unknown
+            { }
+            { day * ", " * }
+          if$
+          year *
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {new.block.checka}
+{
+  empty.or.unknown
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION { periodical }
+{
+  output.bibitem
+  editor empty.or.unknown
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  title emphasize "title" output.check
+  format.date output
+  new.sentence
+  publisher output
+  address output
+  howpublished new.block.checka
+  howpublished output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { proceedings }
+{
+  output.bibitem
+  editor empty.or.unknown
+    { organization output
+      organization format.key output }  % gnp - changed from author format.key
+    { format.editors output.nonnull }
+  if$
+  % author format.key output             % gnp - removed (should be either
+  %                                        editor or organization
+  output.year.check                    % added (newapa)
+  new.block
+  format.btitle format.city "title" output.check        % jtb: added city
+  new.sentence
+  format.bvolume output
+  format.number.series output
+  new.sentence
+  organization output
+  % jtb: normal order: publisher, address
+  publisher output
+  address output
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { techreport }
+{
+  output.bibitem
+  format.authors "author" output.check
+  author format.key output             % added
+  output.year.check                    % added
+  new.block
+  format.btitle "title" output.check
+  new.block
+%   format.tr.number output               % jtb: moved month ...
+  format.tr.number output new.sentence    % Gerry  - need dot 2011/09/28
+  institution "institution" output.check
+  address output
+  new.sentence
+  format.named.pages output
+  % ACM omits year at end in transactions style
+  % format.day.month.year output.nonnull.dot.space  % jtb: ... to here (no parens)
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note
+  fin.entry
+}
+
+FUNCTION { unpublished }
+{
+  output.bibitem
+  format.authors
+  "author" output.check
+  author format.key output              % added
+  output.year.check                     % added
+  new.block
+  format.title "title" output.check
+  fin.sentence
+  output.day.month.year                 % UTAH
+  fin.block
+  output.issue.doi.coden.isxn.lccn.url.note.check
+  fin.entry
+}
+
+FUNCTION { default.type } { misc }
+
+%%% ACM journal-style month definitions: full name if 1--5 letters, else
+%%% abbreviation of 3 or 4 characters and a dot
+
+MACRO {jan}             {"Jan."}
+
+MACRO {feb}             {"Feb."}
+
+MACRO {mar}             {"March"}
+
+MACRO {apr}             {"April"}
+
+MACRO {may}             {"May"}
+
+MACRO {jun}             {"June"}
+
+MACRO {jul}             {"July"}
+
+MACRO {aug}             {"Aug."}
+
+MACRO {sep}             {"Sept."}
+
+MACRO {oct}             {"Oct."}
+
+MACRO {nov}             {"Nov."}
+
+MACRO {dec}             {"Dec."}
+
+
+%%% ====================================================================
+%%%                 I M P O R T A N T   C H A N G E
+%%%
+%%% For the 2009 release of the official acm-*.bst files, there are to
+%%% be NO predefined journal abbreviations in those style files.
+%%%
+%%% ACM may later develop an official list of mappings of full journal
+%%% names of commonly-cited journals to ACM-preferred abbreviations, but
+%%% authors should consider that use of any of these commented-out
+%%% abbreviations is DEPRECATED unless the BibTeX file itself provides
+%%% its own @String{name = "value"} definitions.
+%%%
+%%% Use of journal (and publisher and address) @String{...}
+%%% abbreviations, as opposed to explicit value assignments such as
+%%% journal = "J. ACM" and publisher = "IEEE", is preferred in
+%%% bibliographic databases, because it makes it easier for journal
+%%% production staff to replace those definitions by publisher-preferred
+%%% abbreviations when articles are typeset for publication.
+%%%
+%%% For historical reasons, and because some of these abbreviations are
+%%% used in other (non-ACM) bibliography style files, they are preserved
+%%% here in comments.  Future releases of the acm*-.bst files are likely
+%%% to remove them entirely.
+%%% ====================================================================
+%%%
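+%%% For example (illustrative only), a BibTeX database can define its own
+%%% abbreviation once and reuse it across entries:
+%%%
+%%%     @String{j-TOMS = "ACM Trans. Math. Software"}
+%%%     @Article{..., journal = j-TOMS, ...}
+%%%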
+%%% DEPRECATED: MACRO {acmcs}           {"ACM Comput. Surv."}                   % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {acmlett}         {"ACM Lett. Program. Lang. Syst."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {acta}            {"Acta Inf."}                           % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ai}              {"Artificial Intelligence"}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {al}              {"Ada Lett."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {acr}             {"Adv. Comput. Res."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {bit}             {"Bit"}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {cacm}            {"Commun. ACM"}                         % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {cj}              {"Comput. J."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {cn}              {"Comput. Netw."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {cl}              {"Comput. Lang."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ibmjrd}          {"IBM J. Res. and Development"}         % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ibmsj}           {"IBM Systems Journal"}                 % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ict}             {"Inf. Contr."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieebcs}          {"IEE/BCS Softw. Eng. J."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieees}           {"IEEE Softw."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieeese}          {"IEEE Trans. Softw. Eng."}             % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieeetc}          {"IEEE Trans. Comput."}                 % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieeetcad}        {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieeetpds}        {"IEEE Trans. Parall. Distrib. Syst."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ieeetit}         {"IEEE Trans. Inf. Theory"}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ipl}             {"Inf. Process. Lett."}                 % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {icp}             {"Inf. Comput."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ist}             {"Inf. Softw. Tech."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ijsa}            {"Int. J. Supercomput. Appl."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ijpp}            {"Int. J. Parallel Program."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jacm}            {"J. ACM"}                              % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: % MACRO {jcss}          {"Journal of Computer and System Sciences"} % original BibTeX
+%%% DEPRECATED: MACRO {jcss}            {"J. Comput. Syst. Sci."}               % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jlp}             {"J. Logic Program."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jfp}             {"J. Funct. Program."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jsmrp}           {"J. Softw. Maint. Res. Pract."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jss}             {"J. Syst. Softw."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jlc}             {"J. Logic and Comput."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {jlsc}            {"J. Lisp Symb. Comput."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {lpls}            {"Lett. Program. Lang. Syst."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {mor}             {"Math. Oper. Res."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {mscs}            {"Math. Struct. Comput. Sci."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {mst}             {"Math. Syst. Theor."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {ngc}             {"New Gen. Comput."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {scp}             {"Sci. Comput. Program."}               % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {sicomp}          {"SIAM J. Comput."}                     % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {spe}             {"Softw. Pract. Exper."}
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {tocs}            {"ACM Trans. Comput. Syst."}            % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {tods}            {"ACM Trans. Database Syst."}           % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {tog}             {"ACM Trans. Graphics"}                 % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {toms}            {"ACM Trans. Math. Softw."}             % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {toois}           {"ACM Trans. Office Inf. Syst."}        % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {toplas}          {"ACM Trans. Program. Lang. Syst."}     % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {tcs}             {"Theor. Comput. Sci."}                 % original BibTeX
+%%% DEPRECATED:
+%%% DEPRECATED: MACRO {tr}              {"Tech. Rep."}
+%%% ====================================================================
+
+READ
+
+FUNCTION { sortify }
+{
+  purify$
+  "l" change.case$
+}
+
+FUNCTION { chop.word }
+{ % expects (word len string) on the stack; returns string with the
+  % leading word removed when string begins with it, else unchanged
+  's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION { sort.format.names }
+{
+  's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+          { "   " * }
+         'skip$
+      if$
+  %      s nameptr "{ff{ } }{ll{ }}{  vv{ }}{  jj{ }}" format.name$ 't :=
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+          { " et~al" * }
+          { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION { sort.format.title }
+{
+  't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
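+% e.g. "The {LaTeX} Companion" sorts as "latex companion": the leading
+% article is chopped, then sortify purifies and lowercases the remainder
+% (illustrative title).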
+
+FUNCTION { author.sort }
+{
+  author empty.or.unknown
+    { key empty.or.unknown
+         { "to sort, need author or key in " cite$ * warning$
+           "" }
+         { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { author.editor.sort }
+{
+  author empty.or.unknown
+    {
+      editor empty.or.unknown
+         {
+           key empty.or.unknown
+             { "to sort, need author, editor, or key in " cite$ * warning$
+               ""
+             }
+             { key sortify }
+           if$
+         }
+         { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { author.organization.sort }
+{
+  % added - GNP. Stack author or organization for sorting (from alpha.bst).
+  % Unlike alpha.bst, we need entire names, not abbreviations
+
+  author empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "to sort, need author, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { organization sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION { editor.organization.sort }
+{
+  % added - GNP. Stack editor or organization for sorting (from alpha.bst).
+  % Unlike alpha.bst, we need entire names, not abbreviations
+
+  editor empty.or.unknown
+    { organization empty.or.unknown
+        { key empty.or.unknown
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { organization sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION { presort }
+{
+  % Presort creates the bibentry's label via a call to calc.label, and then
+  % sorts the entries based on entry type. Chicago.bst adds support for
+  % including organizations as the sort key; the following is stolen from
+  % alpha.bst.
+
+  calc.label sortify % recalculate bibitem label
+  year field.or.null purify$ #-1 #4 substring$ * % add year
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  #1 entry.max$ substring$        % added for newapa
+  'sort.label :=                  % added for newapa
+  sort.label                      % added for newapa
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
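+% The assembled sort.key$ has the shape
+%   <label><year>    <author/editor names>    <title>
+% truncated to entry.max$ characters.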
+
+ITERATE { presort }
+
+SORT             % by label, year, author/editor, title
+
+FUNCTION { initialize.extra.label.stuff }
+{ #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'last.extra.num :=
+}
+
+FUNCTION { forward.pass }
+{
+  % Pass through all entries, comparing current entry to last one.
+  % Need to concatenate year to the stack (done by calc.label) to determine
+  % if two entries are the same (see presort)
+
+  last.label
+  % OLD:calc.label year field.or.null purify$ #-1 #4 substring$ * % add year
+  % NEW:
+  author.key.label year field.or.null purify$ #-1 #4 substring$ * % add year
+  #1 entry.max$ substring$ =     % are they equal?
+     { last.extra.num #1 + 'last.extra.num :=
+       last.extra.num int.to.chr$ 'extra.label :=
+     }
+     { "a" chr.to.int$ 'last.extra.num :=
+       "" 'extra.label :=
+       % OLD: calc.label year field.or.null purify$ #-1 #4 substring$ * % add year
+       % NEW:
+       author.key.label year field.or.null purify$ #-1 #4 substring$ * % add year
+       #1 entry.max$ substring$ 'last.label := % assign to last.label
+     }
+  if$
+}
+
+FUNCTION { reverse.pass }
+{
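+  % Walking backwards through the sorted entries: when the following
+  % entry's extra label is "b", this entry heads the run and gets "a".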
+  next.extra "b" =
+    { "a" 'extra.label := }
+     'skip$
+  if$
+  label.year extra.label * 'sort.year :=
+  extra.label 'next.extra :=
+}
+
+EXECUTE {initialize.extra.label.stuff}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION { bib.sort.order }
+{
+  sort.label
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE { bib.sort.order }
+
+SORT             % by sort.label, year, title --- giving final bib. order.
+
+FUNCTION { begin.bib }
+{
+  %% Set to #0 show 13-digit ISBN in preference to 10-digit ISBN.
+  %% Set to #1 to show both 10-digit and 13-digit ISBNs.
+  #1 'show-isbn-10-and-13 :=
+
+  "%%% -*-BibTeX-*-" writeln
+  "%%% Do NOT edit. File created by BibTeX with style" writeln
+  "%%% ACM-Reference-Format-Journals [18-Jan-2012]." writeln
+  "" writeln
+
+  preamble$ empty.or.unknown
+    'skip$
+    { preamble$ writeln }
+  if$
+  "\begin{thebibliography}{00}" writeln
+  ""                                                                         writeln
+  "%%% ====================================================================" writeln
+  "%%% NOTE TO THE USER: you can override these defaults by providing"       writeln
+  "%%% customized versions of any of these macros before the \bibliography"  writeln
+  "%%% command.  Each of them MUST provide its own final punctuation,"       writeln
+  "%%% except for \shownote{}, \showDOI{}, and \showURL{}.  The latter two"  writeln
+  "%%% do not use final punctuation, in order to avoid confusing it with"    writeln
+  "%%% the Web address."                                                     writeln
+  "%%%"                                                                      writeln
+  "%%% To suppress output of a particular field, define its macro to expand" writeln
+  "%%% to an empty string, or better, \unskip, like this:"                   writeln
+  "%%%"                                                                      writeln
+  "%%% \newcommand{\showDOI}[1]{\unskip}   % LaTeX syntax"                   writeln
+  "%%%"                                                                      writeln
+  "%%% \def \showDOI #1{\unskip}           % plain TeX syntax"               writeln
+  "%%%"                                                                      writeln
+  "%%% ====================================================================" writeln
+  ""                                                                         writeln
+
+  %% ACM publications do not use CODEN, ISSN, and LCCN data, so their default
+  %% macro wrappers expand to \unskip, discarding their values and unwanted
+  %% space.
+  %%
+  %% For other publications, prior definitions like these may be useful:
+  %%
+  %%     Plain TeX:
+  %%         \def \showCODEN     #1{CODEN #1.}
+  %%         \def \showISSN      #1{ISSN #1.}
+  %%         \def \showLCCN      #1{LCCN #1.}
+  %%
+  %%     LaTeX:
+  %%         \newcommand{\showCODEN}[1]{CODEN #1.}
+  %%         \newcommand{\showISSN}[1]{ISSN #1.}
+  %%         \newcommand{\showLCCN}[1]{LCCN #1.}
+
+  "\ifx \showCODEN    \undefined \def \showCODEN     #1{\unskip}     \fi" writeln
+  "\ifx \showDOI      \undefined \def \showDOI       #1{{\tt DOI:}\penalty0{#1}\ } \fi" writeln
+  % ACM styles omit ISBNs, but they can be included by suitable definitions of
+  % \showISBNx and \showISBNxiii before the .bbl file is read
+  "\ifx \showISBNx    \undefined \def \showISBNx     #1{\unskip}     \fi" writeln
+  "\ifx \showISBNxiii \undefined \def \showISBNxiii  #1{\unskip}     \fi" writeln
+  "\ifx \showISSN     \undefined \def \showISSN      #1{\unskip}     \fi" writeln
+  "\ifx \showLCCN     \undefined \def \showLCCN      #1{\unskip}     \fi" writeln
+  "\ifx \shownote     \undefined \def \shownote      #1{#1}          \fi" writeln % NB: final period supplied by add.period$ above
+  "\ifx \showarticletitle \undefined \def \showarticletitle #1{#1}   \fi" writeln
+  "\ifx \showURL      \undefined \def \showURL       #1{#1}          \fi" writeln
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION { end.bib }
+{
+  newline$
+  "\end{thebibliography}"
+  writeln
+}
+
+EXECUTE {end.bib}
+
+%%% End of ACM-Reference-Format-Journals.bst V1.00 - 18 January 2012
+
diff --git a/paper_source/acmcopyright.sty b/paper_source/acmcopyright.sty
new file mode 100644 (file)
index 0000000..e8de127
--- /dev/null
@@ -0,0 +1,221 @@
+%%
+%% This is file `acmcopyright.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% acmcopyright.dtx  (with options: `style')
+%% 
+%% IMPORTANT NOTICE:
+%% 
+%% For the copyright see the source file.
+%% 
+%% Any modified versions of this file must be renamed
+%% with new filenames distinct from acmcopyright.sty.
+%% 
+%% For distribution of the original source see the terms
+%% for copying and modification in the file acmcopyright.dtx.
+%% 
+%% This generated file may be distributed as long as the
+%% original source files, as listed above, are part of the
+%% same distribution. (The sources need not necessarily be
+%% in the same archive or directory.)
+%% \CharacterTable
+%%  {Upper-case    \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
+%%   Lower-case    \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
+%%   Digits        \0\1\2\3\4\5\6\7\8\9
+%%   Exclamation   \!     Double quote  \"     Hash (number) \#
+%%   Dollar        \$     Percent       \%     Ampersand     \&
+%%   Acute accent  \'     Left paren    \(     Right paren   \)
+%%   Asterisk      \*     Plus          \+     Comma         \,
+%%   Minus         \-     Point         \.     Solidus       \/
+%%   Colon         \:     Semicolon     \;     Less than     \<
+%%   Equals        \=     Greater than  \>     Question mark \?
+%%   Commercial at \@     Left bracket  \[     Backslash     \\
+%%   Right bracket \]     Circumflex    \^     Underscore    \_
+%%   Grave accent  \`     Left brace    \{     Vertical bar  \|
+%%   Right brace   \}     Tilde         \~}
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{acmcopyright}
+[2014/06/29 v1.2 Copyright statements for ACM classes]
+\newif\if@printcopyright
+\@printcopyrighttrue
+\newif\if@printpermission
+\@printpermissiontrue
+\newif\if@acmowned
+\@acmownedtrue
+\RequirePackage{xkeyval}
+\define@choicekey*{ACM@}{acmcopyrightmode}[%
+  \acm@copyrightinput\acm@copyrightmode]{none,acmcopyright,acmlicensed,%
+  rightsretained,usgov,usgovmixed,cagov,cagovmixed,%
+  licensedusgovmixed,licensedcagovmixed,othergov,licensedothergov}{%
+  \@printpermissiontrue
+  \@printcopyrighttrue
+  \@acmownedtrue
+  \ifnum\acm@copyrightmode=0\relax % none
+   \@printpermissionfalse
+   \@printcopyrightfalse
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=2\relax % acmlicensed
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=3\relax % rightsretained
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=4\relax % usgov
+   \@printpermissiontrue
+   \@printcopyrightfalse
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=6\relax % cagov
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=8\relax % licensedusgovmixed
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=9\relax % licensedcagovmixed
+   \@acmownedfalse
+  \fi
+  \ifnum\acm@copyrightmode=10\relax % othergov
+   \@acmownedtrue
+  \fi
+  \ifnum\acm@copyrightmode=11\relax % licensedothergov
+   \@acmownedfalse
+   \@printcopyrightfalse
+  \fi}
+\def\setcopyright#1{\setkeys{ACM@}{acmcopyrightmode=#1}}
+\setcopyright{acmcopyright}
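+%% Illustrative usage in a document preamble; ``acmlicensed'' is just one
+%% of the modes enumerated above:
+%%   \setcopyright{acmlicensed}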
+\def\@copyrightowner{%
+  \ifcase\acm@copyrightmode\relax % none
+  \or % acmcopyright
+  ACM.
+  \or % acmlicensed
+  Copyright held by the owner/author(s). Publication rights licensed to
+  ACM.
+  \or % rightsretained
+  Copyright held by the owner/author(s).
+  \or % usgov
+  \or % usgovmixed
+  ACM.
+  \or % cagov
+  Crown in Right of Canada.
+  \or %cagovmixed
+  ACM.
+  \or %licensedusgovmixed
+  Copyright held by the owner/author(s). Publication rights licensed to
+  ACM.
+  \or %licensedcagovmixed
+  Copyright held by the owner/author(s). Publication rights licensed to
+  ACM.
+  \or % othergov
+  ACM.
+  \or % licensedothergov
+  \fi}
+\def\@copyrightpermission{%
+  \ifcase\acm@copyrightmode\relax % none
+  \or % acmcopyright
+   Permission to make digital or hard copies of all or part of this
+   work for personal or classroom use is granted without fee provided
+   that copies are not made or distributed for profit or commercial
+   advantage and that copies bear this notice and the full citation on
+   the first page. Copyrights for components of this work owned by
+   others than ACM must be honored. Abstracting with credit is
+   permitted. To copy otherwise, or republish, to post on servers or to
+   redistribute to lists, requires prior specific permission
+   and\hspace*{.5pt}/or  a fee. Request permissions from
+   permissions@acm.org.
+  \or % acmlicensed
+   Permission to make digital or hard copies of all or part of this
+   work for personal or classroom use is granted without fee provided
+   that copies are not made or distributed for profit or commercial
+   advantage and that copies bear this notice and the full citation on
+   the first page. Copyrights for components of this work owned by
+   others than the author(s) must be honored. Abstracting with credit
+   is permitted.  To copy otherwise, or republish, to post on servers
+   or to  redistribute to lists, requires prior specific permission
+   and\hspace*{.5pt}/or  a fee. Request permissions from
+   permissions@acm.org.
+  \or % rightsretained
+   Permission to make digital or hard copies of part or all of this work
+   for personal or classroom use is granted without fee provided that
+   copies are not made or distributed for profit or commercial advantage
+   and that copies bear this notice and the full citation on the first
+   page. Copyrights for third-party components of this work must be
+   honored. For all other uses, contact the
+   owner\hspace*{.5pt}/author(s).
+  \or % usgov
+   This paper is authored by an employee(s) of the United States
+   Government and is in the public domain. Non-exclusive copying or
+   redistribution is allowed, provided that the article citation is
+   given and the authors and agency are clearly identified as its
+   source.
+  \or % usgovmixed
+   ACM acknowledges that this contribution was authored or co-authored
+   by an employee, or contractor of the national government. As such,
+   the Government retains a nonexclusive, royalty-free right to
+   publish or reproduce this article, or to allow others to do so, for
+   Government purposes only. Permission to make digital or hard copies
+   for personal or classroom use is granted. Copies must bear this
+   notice and the full citation on the first page. Copyrights for
+   components of this work owned by others than ACM must be
+   honored. To copy otherwise, distribute, republish, or post,
+   requires prior specific permission and\hspace*{.5pt}/or a
+   fee. Request permissions from permissions@acm.org.
+  \or % cagov
+   This article was authored by employees of the Government of Canada.
+   As such, the Canadian government retains all interest in the
+   copyright to this work and grants to ACM a nonexclusive,
+   royalty-free right to publish or reproduce this article, or to allow
+   others to do so, provided that clear attribution is given both to
+   the authors and the Canadian government agency employing them.
+   Permission to make digital or hard copies for personal or classroom
+   use is granted. Copies must bear this notice and the full citation
+   on the first page.  Copyrights for components of this work owned by
+   others than the Canadian Government must be honored. To copy
+   otherwise, distribute, republish, or post, requires prior specific
+   permission and\hspace*{.5pt}/or a fee. Request permissions from
+   permissions@acm.org.
+  \or % cagovmixed
+   ACM acknowledges that this contribution was co-authored by an
+   affiliate of the national government of Canada. As such, the Crown
+   in Right of Canada retains an equal interest in the copyright.
+   Reprints must include clear attribution to ACM and the author's
+   government agency affiliation.  Permission to make digital or hard
+   copies for personal or classroom use is granted.  Copies must bear
+   this notice and the full citation on the first page. Copyrights for
+   components of this work owned by others than ACM must be honored.
+   To copy otherwise, distribute, republish, or post, requires prior
+   specific permission and\hspace*{.5pt}/or a fee. Request permissions
+   from permissions@acm.org.
+  \or % licensedusgovmixed
+   Publication rights licensed to ACM. ACM acknowledges that this
+   contribution was authored or co-authored by an employee, contractor
+   or affiliate of the United States government. As such, the
+   Government retains a nonexclusive, royalty-free right to publish or
+   reproduce this article, or to allow others to do so, for Government
+   purposes only.
+  \or % licensedcagovmixed
+   Publication rights licensed to ACM. ACM acknowledges that this
+   contribution was authored or co-authored by an employee, contractor
+   or affiliate of the national government of Canada. As such, the
+   Government retains a nonexclusive, royalty-free right to publish or
+   reproduce this article, or to allow others to do so, for Government
+   purposes only.
+  \or % othergov
+   ACM acknowledges that this contribution was authored or co-authored
+   by an employee, contractor or affiliate of a national government. As
+   such, the Government retains a nonexclusive, royalty-free right to
+   publish or reproduce this article, or to allow others to do so, for
+   Government purposes only.
+  \or % licensedothergov
+   Publication rights licensed to ACM. ACM acknowledges that this
+   contribution was authored or co-authored by an employee, contractor
+   or affiliate of a national government. As such, the Government
+   retains a nonexclusive, royalty-free right to publish or reproduce
+   this article, or to allow others to do so, for Government purposes
+   only.
+  \fi}
+\endinput
+%%
+%% End of file `acmcopyright.sty'.
diff --git a/paper_source/auto/generalizable_wiki.el b/paper_source/auto/generalizable_wiki.el
new file mode 100644 (file)
index 0000000..c87a921
--- /dev/null
@@ -0,0 +1,53 @@
+(TeX-add-style-hook
+ "generalizable_wiki"
+ (lambda ()
+   (TeX-add-to-alist 'LaTeX-provided-package-options
+                     '(("color" "usenames" "dvipsnames") ("fontenc" "T1") ("hyperref" "pdflang={en-US}" "pdftex") ("hypcap" "all") ("inputenc" "utf8")))
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "path")
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "url")
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "nolinkurl")
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "hyperbaseurl")
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "hyperimage")
+   (add-to-list 'LaTeX-verbatim-macros-with-braces-local "hyperref")
+   (add-to-list 'LaTeX-verbatim-macros-with-delims-local "path")
+   (TeX-run-style-hooks
+    "latex2e"
+    "sigchi"
+    "sigchi10"
+    "color"
+    "dcolumn"
+    "array"
+    "balance"
+    "graphics"
+    "fontenc"
+    "txfonts"
+    "mathptmx"
+    "hyperref"
+    "booktabs"
+    "textcomp"
+    "microtype"
+    "hypcap"
+    "ccicons"
+    "inputenc"
+    "tikz")
+   (TeX-add-symbols
+    '("tabhead" 1)
+    "plaintitle"
+    "plainauthor"
+    "emptyauthor"
+    "plainkeywords"
+    "plaingeneralterms"
+    "UrlFont"
+    "pprw"
+    "pprh")
+   (LaTeX-add-labels
+    "plot.editors.time"
+    "newcomer-survival")
+   (LaTeX-add-bibliographies
+    "refs")
+   (LaTeX-add-array-newcolumntypes
+    "d")
+   (LaTeX-add-xcolor-definecolors
+    "linkColor"))
+ :latex)
+
diff --git a/paper_source/figure/newcomer_survival_reversion-1.pdf b/paper_source/figure/newcomer_survival_reversion-1.pdf
new file mode 100644 (file)
index 0000000..53ecd92
Binary files /dev/null and b/paper_source/figure/newcomer_survival_reversion-1.pdf differ
diff --git a/paper_source/figure/plot-editors-1.pdf b/paper_source/figure/plot-editors-1.pdf
new file mode 100644 (file)
index 0000000..be2ced0
Binary files /dev/null and b/paper_source/figure/plot-editors-1.pdf differ
diff --git a/paper_source/generalizable_wiki.Rtex b/paper_source/generalizable_wiki.Rtex
new file mode 100644 (file)
index 0000000..7e703ad
--- /dev/null
@@ -0,0 +1,455 @@
+\documentclass{sigchi}
+%\documentclass[12pt]{article} % FOR PRINTING: OTHERWISE REMOVE THIS LINE 
+
+<<preinit, echo=FALSE>>=
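+# Post-process knitr's assembled LaTeX: strip the [dvipsnames,usenames]
+# options from the color package line, leaving \usepackage[]{color}.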
+knit_hooks[['set']](document = function(x) {
+sub('\\usepackage[dvipsnames,usenames]{color}',
+'\\usepackage[]{color}', x, fixed = TRUE)
+})
+@
+
+% Use this command to override the default ACM copyright statement
+% (e.g. for preprints).  Consult the conference website for the
+% camera-ready copyright statement.
+
+% Arabic page numbers for submission.  Remove this line to eliminate
+% page numbers for the camera ready copy
+% \pagenumbering{arabic}
+\usepackage[pdflang={en-US},pdftex]{hyperref}
+% Load basic packages
+\usepackage{dcolumn}
+\newcolumntype{d}[1]{D{.}{.}{#1} }
+\usepackage{array}
+% \usepackage{balance}       % to better equalize the last page
+\usepackage{graphics}      % for EPS, load graphicx instead 
+\usepackage[T1]{fontenc}   % for umlauts and other diaeresis
+\usepackage{txfonts}  % temporarily(?) turned off -mako
+\usepackage{mathptmx}
+
+\usepackage{color}
+\usepackage{booktabs}
+\usepackage{textcomp}
+\usepackage{balance}
+% Some optional stuff you might like/need.
+\usepackage{microtype}        % Improved Tracking and Kerning
+ \usepackage[all]{hypcap}    % Fixes bug in hyperref caption linking
+\usepackage{ccicons}          % Cite your images correctly!
+\usepackage[utf8]{inputenc} % for a UTF8 editor only
+
+\usepackage{dcolumn}
+% Paper metadata (use plain text, for PDF inclusion and later
+% re-using, if desired).  Use \emtpyauthor when submitting for review
+% so you remain anonymous.
+\def\plaintitle{Revisiting ``The Rise and Decline'' \\ in a Population of Peer Production Projects}
+\def\plainkeywords{governance; peer production; online communities; quality control; retention; replication; Wikipedia; wikis}
+\def\plainauthor{Nathan TeBlunthuis, Aaron Shaw, Benjamin Mako Hill}
+\def\emptyauthor{}
+
+
+
+% llt: Define a global style for URLs, rather that the default one
+%  \makeatletter
+% \def\url@leostyle{%
+%  \@ifundefined{selectfont}{
+%    \def\UrlFont{\sf}
+%  }{
+%    \def\UrlFont{\small\bf\ttfamily}
+%  }}
+% \makeatother
+% \urlstyle{leo}
+
+% To make various LaTeX processors do the right thing with page size.
+ \def\pprw{8.5in}
+\def\pprh{11in}
+\special{papersize=\pprw,\pprh}
+\setlength{\paperwidth}{\pprw}
+\setlength{\paperheight}{\pprh}
+\setlength{\pdfpagewidth}{\pprw}
+\setlength{\pdfpageheight}{\pprh}
+
+\usepackage{tikz}
+\usetikzlibrary{arrows}
+\usetikzlibrary{positioning}
+
+% Make sure hyperref comes last of your loaded packages, to give it a
+% fighting chance of not being over-written, since its job is to
+% redefine many LaTeX commands.
+\definecolor{linkColor}{RGB}{6,125,233}
+\hypersetup{%
+  pdftitle={\plaintitle},
+% Use \plainauthor for final version.
+  pdfauthor={\plainauthor},
+%  pdfauthor={\emptyauthor},
+  pdfkeywords={\plainkeywords},
+  pdfdisplaydoctitle=true, % For Accessibility
+  bookmarksnumbered,
+  pdfstartview={FitH},
+  colorlinks,
+  citecolor=black,
+  filecolor=black,
+  linkcolor=black,
+  urlcolor=black,
+  breaklinks=true,
+  hypertexnames=false
+}
+
+% create a shortcut to typeset table headings
+ \newcommand\tabhead[1]{\small\textbf{#1}}
+
+% End of preamble. Here it comes the document.
+\toappear{\scriptsize Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). Copyright is held by the author/owner(s). \\
+{\emph{CHI 2018, April 21--26, 2018, Montr\'eal, QC, Canada.} } \\
+ACM ISBN 978-1-4503-5620-6/18/04. \\
+https://doi.org/10.1145/3173574.3173929}
+
+\clubpenalty=10000
+\widowpenalty = 10000
+
+
+\begin{document}
+%CopyrightYear{2018}
+%setcopyright{rightsretained}
+%conferenceinfo{CHI 2018}{April 21--26, 2018, Montreal, QC, %Canada}\isbn{978-1-4503-5620-6/18/04}
+%\doi{}
+
+
+<<init,echo=FALSE>>=
+
+library(scales)
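+# Signed log2 transform and inverse used for plot scales: values x with
+# |x| >= 1 map to sign(x) * (log2(|x|) + 1), so magnitudes on both sides
+# of zero can share a log-like axis.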
+neg.log.2 <- function(x){
+    ix.1 <- (x<0) & !is.na(x)
+    ix.2 <- (x>0) & !is.na(x)
+    x[ix.1] <- -1*log(abs(x[ix.1]),base=2) - 1 
+    x[ix.2] <- log(x[ix.2],base=2) + 1
+    return(x)
+}
+
+neg.log.2.inv <- function(x){
+    ix.1 <- (x<0) & !is.na(x)
+    ix.2 <- (x>0) & !is.na(x)
+    x[ix.1] <- -1*2**(abs(x[ix.1] + 1))
+    x[ix.2] <- 2**(x[ix.2]-1)
+    return(x)
+}
+
+neg.log2.trans <- trans_new("neg.log2", transform=neg.log.2,inverse=neg.log.2.inv)
+
+# Format a number as an ordinal with a LaTeX superscript suffix,
+# stripping a leading zero and handling the 11/12/13 exceptions.
+format.ordinal <- function(n){
+    n <- as.character(n)
+    if(substr(n,1,1) == '0')
+        n <- substr(n,2,nchar(n))
+    last <- substr(n,nchar(n),nchar(n))
+    last.two <- substr(n,max(1,nchar(n)-1),nchar(n))
+    if(last.two %in% c("11","12","13"))
+        return(paste0(n,"\\textsuperscript{th}"))
+    else if(last == "1")
+        return(paste0(n,"\\textsuperscript{st}"))
+    else if(last == "2")
+        return(paste0(n,"\\textsuperscript{nd}"))
+    else if(last == "3")
+        return(paste0(n,"\\textsuperscript{rd}"))
+    else
+        return(paste0(n,"\\textsuperscript{th}"))
+}
+
+library('ggplot2')
+library('data.table')
+suppressPackageStartupMessages(library('lubridate'))
+
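+# Helpers for interpreting logistic regression coefficients: logit()
+# gives the predicted probability at covariate value x with coefficient
+# B, and odds() the implied odds.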
+logit <- function(x,B){
+  1 / (1 + exp(-x*B))
+}
+
+odds <- function(x,B){
+  logit(x,B) / (1 - logit(x,B))
+}
+
+#suppressPackageStartupMessages(library(texreg,quietly=TRUE,lib.loc="x86_64-pc-linux-gnu-library"))
+bold <- function(x) {paste('{\\textbf{',x,'}}', sep ='')}
+gray <- function(x) {paste('{\\textcolor{gray}{',x,'}}', sep ='')}
+wrapify <- function (x) {paste("{", x, "}", sep="")}
+
+r.userroles <- readRDS("knitr/lib-01-generate_userroles.RDS")
+r <- readRDS("knitr/remember.RDS")
+attach(r)
+m1.coef <- as.list(halfak.model@coef)
+m1.se <- as.list(halfak.model@se)
+m2.coef <- as.list(morgan.model@coef)
+m2.se <- as.list(morgan.model@se)
+
+f <- function (x) {formatC(x, format="d", big.mark=',')}
+format.percent <- function(x) {paste(signif(100*x,2),"\\%",sep='')}
+# Format the day of the month as an ordinal, stripping a leading zero
+# and handling the 11th-13th exceptions.
+format.day.ordinal <- function(x) {
+    day <- format(x,format="%d")
+    daylast <- substr(day,nchar(day),nchar(day))
+    if(substr(day,1,1) == '0')
+        day <- daylast
+
+    if(day %in% c("11","12","13"))
+        day <- paste0(day,"\\textsuperscript{th}")
+    else if(daylast == "1")
+        day <- paste0(day,"\\textsuperscript{st}")
+    else if(daylast == "2")
+        day <- paste0(day,"\\textsuperscript{nd}")
+    else if(daylast == "3")
+        day <- paste0(day,"\\textsuperscript{rd}")
+    else
+        day <- paste0(day,"\\textsuperscript{th}")
+
+    return(day)
+}
+
+format.month <- function(x){
+    return( format(x,format='%B %Y'))
+}
+
+format.date <- function(x) {
+    return(paste(format(x,format = '%B'),format.day.ordinal(x),format(x,format='%Y'),sep=' '))
+}
+@
+
+\title{\plaintitle}
+
+\numberofauthors{3}
+\author{%
+  \alignauthor{Nathan TeBlunthuis\\
+    \affaddr{University of Washington}\\
+    \affaddr{Seattle, Washington, USA}\\
+    \email{nathante@uw.edu}}\\
+  \alignauthor{Aaron Shaw\\
+    \affaddr{Northwestern University}\\
+    \affaddr{Evanston, IL, USA}\\
+    \email{aaronshaw@northwestern.edu}}\\
+  \alignauthor{Benjamin Mako Hill\\
+    \affaddr{University of  Washington}\\
+    \affaddr{Seattle, Washington, USA}\\
+    \email{makohill@uw.edu}}\\
+}
+\maketitle
+
+\begin{abstract}
+Do patterns of growth and stabilization found in large peer production systems such as Wikipedia occur in other communities? This study assesses the generalizability of Halfaker et al.'s influential 2013 paper on ``The Rise and Decline of an Open Collaboration System.'' We replicate its tests of several theories related to newcomer retention and norm entrenchment using a dataset of hundreds of active peer production wikis from Wikia. We reproduce the subset of the findings from Halfaker and colleagues that we are able to test, comparing both the estimated signs and magnitudes of our models. Our results support the external validity of Halfaker et al.'s claims that quality control systems may limit the growth of peer production communities by deterring new contributors and that norms tend to become entrenched over time.
+\end{abstract}
+
+\category{H.5.3.}{Information Interfaces and Presentation (e.g. HCI)}{Group and Organization Interfaces -- Computer-supported cooperative work}
+
+\keywords{governance; peer production; online communities; quality control; retention; replication; Wikipedia; wikis}
+
+
+
+\section{Introduction}
+
+``Peer production'' describes a way of organizing collaborative information production in online commons \cite{benkler_coases_2002-2}. Over the last decade, peer production has become a central object of HCI research. However, the vast majority of peer production research has studied a small number of the largest communities \cite{benkler_peer_2015,crowston_free/libre_2008}. An enormous portion of empirical studies of peer production in HCI focus on the English-language version of Wikipedia. Unfortunately, HCI's historical focus on novelty has meant that tests of the applicability of findings shown in one setting to other contexts rarely happen \cite{wilson_replichi_2011}. As a result, we know little about the degree to which theory and design claims from studies of Wikipedia apply more broadly.
+
+\begin{figure}
+<<plot-editors, echo=FALSE,fig.height=3.125,fig.width=5,out.width="\\columnwidth",cache=FALSE,message=FALSE,warning=FALSE>>=
+xlabels = paste0("Year ", 0:max(plot.active.editors.dt$wiki.age.years))
+
+xbreaks = plot.active.editors.dt[,.(b=min(wiki.age.months)),by=.(wiki.age.years)]$b
+
+p2 <- ggplot(plot.active.editors.dt, aes(y=sd.units.active.editors,x=wiki.age.months,ymin=lower.ci,ymax=upper.ci)) + geom_point()
+
+p2 <- p2 + geom_errorbar(width=0.5,alpha=0.25)
+
+p2 <- p2 + geom_smooth(method='loess',se=FALSE,linetype='dashed',color='#E69F00',size=1.1)
+
+p2 <- p2 + scale_x_continuous(name="Wiki age",breaks=xbreaks,labels=xlabels) + scale_y_continuous(name="Active editors (Std dev units)",limit=c(1,2.05),breaks=c(1,1.5,2),minor_breaks=NULL)
+
+p2 <- p2 + theme_minimal(base_size=12) + theme(legend.position="None") 
+
+print(p2)
+@
+
+\caption{Mean of the number of editors with at least 5 edits per month in standard deviation units for wikis in our sample. The dashed line represents the results of a LOESS regression. The error bars represent bootstrap 95\% confidence intervals. This replicates Figure 2 in RAD.}
+\label{plot.editors.time}
+\end{figure}
+
+This paper replicates analysis from Halfaker et al.'s ``The Rise and Decline of an Open Collaboration System'' \cite{halfaker_rise_2013} (which we abbreviate  ``RAD'') in a sample of \Sexpr{n.wikia.wikis} active wikis hosted on Wikia.\footnote{Wikia is a wiki hosting platform where anyone can start a wiki. In 2016, Wikia partially rebranded as ``Fandom'' to emphasize support for fan communities. See: \url{https://www.wikia.com/} (\url{https://perma.cc/TL79-VB57}).} RAD makes one of the most influential and highly cited claims about peer production dynamics, attributing English Wikipedia's decline in contributors since 2007 to entrenchment (RAD uses the term ``calcification'') within the community as norms and policies become difficult to change, especially for newer users. Our results reproduce most of RAD's findings. Like RAD, we find that the average community in our dataset experiences a ``rise and decline,'' that newcomers are less likely to survive over time, that rejected newcomers are less likely to survive, that editors with longer tenure have more influence over norms, and that norms become entrenched as wikis age. In addition to providing an external validation of RAD's findings, we rule out alternative explanations of RAD's results that emphasize unique attributes of Wikipedia or the timing of the editor decline in that community.
+
+\section{Background}
+\subsection{Entrenchment in Wikipedia and Peer Production}
+
+Active peer production communities often experience a period of rapid growth followed by stabilization \cite{ortega_wikipedia:_2009, schweik_internet_2012}. Following this pattern, the number of contributors to English Wikipedia grew exponentially until March 2007, when it began to decline \cite{suh_singularity_2009-2}. Although early accounts of peer production argued that projects such as Wikipedia and the Linux kernel mobilized massive collaboration without the sorts of formal hierarchies or bureaucracies used in formal organizations \cite{benkler_coases_2002-2, konieczny_governance_2009}, organizational research has argued that the formalization of rules, norms, and routines accompanies this trajectory in many types of organizations \cite{hannan_population_1977, meyer_institutionalized_1977, scott_organizations_2006}. Drawing from this work, early explanations for Wikipedia's decline included bureaucratic overhead and increasing resistance to contributions from less active editors \cite{suh_singularity_2009-2, forte_decentralization_2009}. During this same period, algorithmic tools such as ``bots'' became important parts of Wikipedia's quality control systems \cite{geiger_work_2010}, and the proportion of edits that were rejected increased \cite{halfaker_dont_2011}. Building on this prior work, Halfaker et al.'s ``The Rise and Decline of an Open Collaboration System'' \cite{halfaker_rise_2013} found evidence in support of the theory that three elements of Wikipedia's quality control system---newcomer rejection, algorithmic tools, and norm entrenchment---could explain the transition from growth to decline. However, despite the impact and influence of RAD's explanation, it has not been replicated beyond Wikipedia until now.
+
+\subsection{Replication in Social Computing Research}
+
+Although HCI research prizes novelty and provocation, it also seeks to build scientifically rigorous, replicable, and generalizable knowledge \cite{wilson_replichi_2011}. Replicability refers to how well results hold up when other researchers follow reported procedures. Hornbæk et al.~define replication studies as attempts ``to confirm, expand, or generalize an earlier study's findings'' \cite{hornbaek_is_2014}. Generalizability (external validity) refers to the degree to which results hold up across different populations \cite{bollen_social_2015}. Replication studies thus assess whether details of context or methodological choice explain results.
+
+Although comparative analysis of peer production communities has emerged as an important means to understand the life cycles and dynamics of social computing systems \cite{ortega_wikipedia:_2009,roth_measuring_2008,shaw_laboratories_2014}, there have been few efforts to establish whether findings from Wikipedia and other large communities replicate or generalize \cite{benkler_peer_2015,hill_studying_2017}. In one important exception, Kittur and Kraut \cite{kittur_beyond_2010} examine the prevalence of social mechanisms related to conflict and coordination in Wikipedia among nearly 7,000 wikis from Wikia. Their work found both similarities and differences between these communities and Wikipedia.
+
+\subsection{Replicating RAD}
+
+Do the relationships described in RAD generalize to other peer production communities? The evidence on project life cycles, stabilization, and entrenchment suggests that similar patterns may occur beyond Wikipedia. However, Wikipedia's scale and popularity make it a unique outlier among these communities. It is likely unusual in other ways as well. In their conclusion, the RAD authors note: ``Wikipedia's  [growth and quality assurance] challenges may seem unique to its status as one of the largest collaborative projects in human history.'' Nevertheless, they suggest that their analysis of ``sociotechnical gatekeeping and its consequences''  has general applicability.  Indeed, their conclusions have informed analyses of crowdsourced fund-raising \cite{agrawal_simple_2014}, social media \cite{crawford_what_2016}, and online collaborative mapping \cite{palen_success_2015}. 
+
+This paper assesses the replicability and external validity of RAD to provide an empirical foundation for such generalization. In doing so, we also evaluate several alternative explanations that RAD could not rule out. In particular, the simultaneous decline and entrenchment RAD observes in Wikipedia could be driven by external factors related to time, such as the rise of other online communities (e.g., Facebook) that might compete for newcomers. By studying a population of communities whose trajectories start at different points in time, we can model wiki age accounting for calendar time in ways RAD could not. By studying many communities, we can also better understand the scope of RAD's generalizability by measuring variation between wikis.
+
+\section{Methods}
+% P.1
+We attempt to follow RAD's measures and methods to the fullest extent possible. In some places, we are forced to make changes to accommodate differences between English Wikipedia and Wikia and the fact that our analysis includes multiple wikis. To describe our methodology, we briefly summarize RAD's techniques and note several ways that we diverge. Additional detail on operationalization is provided in RAD. We also provide access to the complete R source code used in our analysis in the supplementary material that accompanies this paper. In the description that follows, variable names are italicized.
+
+% P.2
+The RAD authors present three interdependent analyses. The first tests whether the rejection of edits made by newcomers causes decreased newcomer retention, which in turn leads to a decline in the number of active editors.
+To support this claim, the RAD authors use data from English Wikipedia to plot three trends: the number of active contributors, the rate of newcomer survival, and the rate of newcomer rejection. The first plot shows the rise and decline in active contributors (i.e.,~individuals who make at least 5 edits in a given month). The second plot shows that the proportion of good-faith newcomers who ``survive'' falls over time. The third plot shows that the proportion of good-faith newcomers ``rejected'' in their first edit session rises over time. RAD considers a newcomer to have \emph{survived} if the newcomer edits during the period between \Sexpr{as.double(newcomer.period,units='days')} days and \Sexpr{as.double(newcomer.sunset,units='days')} days after their first edit session (i.e.,~a sequence of consecutive edits less than one hour apart) and to have been \emph{rejected} if a change the newcomer makes to an article in their first edit session is undone. Using data from all newcomers drawn from a set of Wikia wikis, we replicate these plots in our Study 1.
+
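+% The following sketch (eval=FALSE; names are illustrative, not the
+% names used in our pipeline) shows one way these measures can be
+% operationalized with data.table. It assumes a table `edits` with
+% columns wiki, user, timestamp (POSIXct), and is.reverted, plus
+% difftime values newcomer.period and newcomer.sunset.
+<<survival-measure-sketch, eval=FALSE, echo=FALSE>>=
+setkey(edits, wiki, user, timestamp)
+
+## A session is a run of consecutive edits less than one hour apart.
+edits[, gap.hours := as.numeric(difftime(timestamp, shift(timestamp),
+                                         units="hours")),
+      by=.(wiki, user)]
+edits[, session := cumsum(is.na(gap.hours) | gap.hours >= 1),
+      by=.(wiki, user)]
+
+## Rejected: an edit from the first session was undone.
+newcomers <- edits[session == 1,
+                   .(first.session.end=max(timestamp),
+                     rejected=any(is.reverted)),
+                   by=.(wiki, user)]
+
+## Survived: edited between newcomer.period and newcomer.sunset days
+## after the first edit session.
+edits <- merge(edits, newcomers, by=c("wiki", "user"))
+survival <- edits[, .(survived=any(timestamp > first.session.end + newcomer.period &
+                                   timestamp <= first.session.end + newcomer.sunset)),
+                  by=.(wiki, user)]
+@
+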
+% P.3
+Additionally, RAD provides evidence that newcomer rejection is a mechanism for declining newcomer retention by estimating logistic regression models predicting newcomer survival. According to these models, newcomers are less likely to survive both when rejected and when the community was older. RAD presents separate models for good-faith newcomers and for all newcomers. The variables in their model are \textit{year} to model time, \textit{session edits} (the number of edits made in the first session) to account for the newcomer's early activity level, \textit{messaged} specifying if the newcomer was messaged during the newcomer's first 60 days, \textit{reverted} indicating if the newcomer had an edit to an existing page rejected, and \textit{deleted} to specify whether the newcomer created a new page which was deleted. We replicate these findings in our Study 2.
+
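+% As a sketch (eval=FALSE; variable names are illustrative), RAD's
+% newcomer survival model corresponds to a logistic regression of
+% roughly this form, fit to one row per newcomer:
+<<rad-survival-sketch, eval=FALSE, echo=FALSE>>=
+rad.model <- glm(survived ~ year + session.edits + messaged +
+                   reverted + deleted,
+                 family=binomial(link="logit"),
+                 data=newcomers)
+summary(rad.model)
+@
+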
+% P.4
+RAD's second analysis builds closely on the same logistic regression to test their theory that the rise of algorithmic quality control tools is an additional cause of Wikipedia's transition from rise to decline. They follow the methods of Geiger et al.~\cite{geiger_defense_2012} to track a number of different tools, including bots, to create a variable, \textit{tool reverted}, that indicates whether the newcomer was reverted by a bot or by a human using an algorithmic tool. Plots in their paper show that tool use increased greatly and that desirable newcomers were increasingly likely to be reverted by tools. RAD argues that tool use may decrease newcomer retention over and above other forms of rejection because tool users are less likely to practice the ``BOLD, revert, discuss'' (BRD) cycle, a norm thought to mitigate discouragement following rejection. BRD prescribes that reverting editors discuss their reverts with those they revert. A negative coefficient for \textit{tool reverted} in the logistic regression model described above provides evidence that algorithmic tool use may be a mechanism for declining newcomer retention. Because tool-based rejection is extremely rare on Wikia, we do not attempt to replicate RAD's finding that tool use is associated with lower levels of ``BOLD, revert, discuss.'' The rest of their analysis is replicated in our Study 2.
+
+% P.5
+In their third analysis, the RAD authors seek to measure the entrenchment of norms on Wikipedia. Norms are formed at many sites on Wikipedia, including three different kinds of norm pages analyzed by RAD: official policy pages, less formal guidelines, and informal essays. As evidence that norm entrenchment may be a cause of the decline, they plot the number of edits to these different kinds of pages over time. Edits to policies and guidelines began decreasing in 2006. Edits to essays slowed during the transition from rise to decline in 2008, decreasing thereafter. 
+
+% P.6
+RAD once again uses a logistic regression predicting whether an edit to a norm page is \emph{reverted} to provide evidence of norm entrenchment: norm pages become more difficult to edit over time, measured as \textit{year}, and those with greater \emph{editor tenure} have their contributions to norm pages reverted less often than newer editors. They also model whether or not the norm page was an \textit{essay}, the interaction between \textit{editor tenure} and \textit{essay}, and the interaction between \textit{essay} and \textit{year}. They find that essays had calcified substantially less than policies. Norm page categories do not exist systematically on Wikia, so we are not able to reproduce the analysis of different levels of formality in norm pages. We replicate the other analyses from RAD's regression analysis in our Study 3.
+
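+% Sketched as code (eval=FALSE; names are illustrative), RAD's
+% entrenchment model is a logistic regression over edits to norm pages
+% with the interactions described above; our replication drops the
+% essay terms because Wikia lacks norm page categories:
+<<entrenchment-sketch, eval=FALSE, echo=FALSE>>=
+entrenchment.model <- glm(reverted ~ editor.tenure * essay + year * essay,
+                          family=binomial(link="logit"),
+                          data=norm.page.edits)
+@
+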
+% P.7
+As we have suggested, there are several parts of RAD that we do not attempt to replicate. RAD makes considerable effort to address a threat arising from the high prevalence of vandalism on Wikipedia---because vandals may not intend to continue contributing, they may be unaffected by rejection. A decline in the number of desirable newcomers may also be a cause of the decline in active contributors. To address these potential confounds, they hand-code a sample of ``good-faith'' newcomers and report that the proportion of newcomers classified as ``good-faith'' fell during the period of rapid growth, about one year before the transition to decline. Their results for their sample of good-faith newcomers and for all newcomers are substantively similar.
+
+% P.8
+Our work is only able to replicate RAD in a sample of all newcomers and does not attempt to create a sub-sample of desirable contributors. As experienced Wikipedians, the RAD authors and their volunteer coders were qualified to judge the quality of newcomers. Many Wikia wikis are about subject matter we are not familiar with. In many cases, they are written in languages we cannot read. We are confident in our results despite this omission for two reasons. First, RAD found very similar estimates in the models restricted to good-faith newcomers and the models that include all newcomers. To the extent that their good-faith-only estimates represented a robustness check that their analysis passed, we are comfortable forgoing it. Second, exploratory analysis of our data suggests that rates of vandalism are lower on Wikia than on Wikipedia, which should lessen the underlying threat.
+
+% P.9
+Additionally, our analysis deviates from RAD in several ways that reflect the challenges and threats associated with studying a population of communities. Most importantly, we diverge from RAD by using estimation techniques and additional control variables appropriate to data nested within multiple communities.
+Because we consider multiple communities, it is possible that a single person might be a ``newcomer'' in our dataset more than once. To avoid analytic problems with repeated measures of users, and because individuals with experience in other wikis are likely not newcomers in the way RAD conceptualized them, our analysis identifies newcomers as individuals who have not previously edited any wiki in our sample and who are not marked as bots. In results available in our supplement, we fit models that include newcomers with prior experience in other wikis in our sample. Our results are not substantively different.
+
+%P.10 TODO. it would be nice to cite something in this paragraph
+Of course, RAD itself has limitations. In particular, the quality of the evidence for the proposed causes of newcomer retention hinges on the assumption that other contemporaneous factors did not drive the decline. However, external events such as the rise of social media sites such as Facebook, as well as cultural changes in how the Internet was used and popularly understood, overlap with the transition from growth to decline. Studying multiple wikis that began at different points in time allows us to partially address this limitation in RAD. Our analysis inherits other limitations from RAD that we do not address. Importantly, entrenchment is theorized to contribute to declining newcomer retention, although this relationship is not modeled explicitly.
+
+\subsection{Data}
+
+Our dataset consists of page, user, and revision history data from \Sexpr{n.wikia.wikis} wikis publicly hosted on Wikia, the largest peer production wiki platform in terms of number of communities. Our initial dataset included all public edits to all Wikia wikis between \Sexpr{format.date(earliest.data.point)} and \Sexpr{format.date(latest.data.point)}.
+The \Sexpr{n.wikia.wikis} wikis whose data we use to replicate RAD comprise the top 1\% of Wikia wikis by number of unique registered article editors. We include only these wikis because they have newcomer and governance activity appropriate for replicating RAD.
+We follow the RAD authors by including only newcomers whom we can observe for \Sexpr{as.double(newcomer.sunset,units='days')} days. We also obtain records identifying bot and administrator accounts from the Wikia API. We exclude \Sexpr{length(deleted.wikis)} wikis for which the API is unavailable because they had been deleted after the XML archives we used were created.
+
+Our dataset includes substantial variation between wikis, including linguistic diversity,  activity level, and organizational complexity. Wikis vary in size, with numbers of unique contributors ranging from \Sexpr{f(wiki.stats[,min(total.editors)])} to \Sexpr{f(wiki.stats[,max(total.editors)])} (median \Sexpr{f(wiki.stats[,median(total.editors)])}). Some wikis in our sample produce collections of facts about popular culture, video games, and fandom. Others, such as the \textit{Althistory Wiki},\footnote{\url{http://althistory.wikia.com/}  (\url{https://perma.cc/4EPQ-FW6Q})} write collaborative fiction. Still others, such as \textit{Uncyclopedia},\footnote{\url{http://uncyclopedia.wikia.com/} (\url{https://perma.cc/L9CC-KN5A})} parody Wikipedia. 
+These \Sexpr{n.wikia.wikis} wikis also vary along our measures. For example, quality control practices vary, and the number of reverts within our communities ranges from \Sexpr{f(wiki.stats[,min(total.reverts)])} to \Sexpr{f(wiki.stats[,max(total.reverts)])} (median \Sexpr{f(wiki.stats[,median(total.reverts)])}). Only \Sexpr{format.percent(wiki.stats[,mean(total.bot.reverts!=0)])} of these wikis use bots to revert edits, and among the wikis with any bot reverts, the number ranges from \Sexpr{f(wiki.stats[total.bot.reverts!=0,min(total.bot.reverts)])} to \Sexpr{f(wiki.stats[total.bot.reverts!=0,max(total.bot.reverts)])} (median \Sexpr{f(wiki.stats[total.bot.reverts!=0,median(total.bot.reverts)])}). The communities also vary in terms of policy making activity. A ``namespace'' is a high-level category used on all wikis. The project namespace is typically used for policy documentation and governance activity. The number of edits to the project namespace ranges from \Sexpr{f(wiki.stats[,min(total.ns4.edits)])} to \Sexpr{f(wiki.stats[,max(total.ns4.edits)])} (median \Sexpr{f(wiki.stats[,median(total.ns4.edits)])}).
+
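+% As a sketch (eval=FALSE; `wiki.editor.counts` and its columns are
+% illustrative names), the sample selection keeps the top 1\% of wikis
+% by number of unique registered article editors:
+<<wiki-selection-sketch, eval=FALSE, echo=FALSE>>=
+setorder(wiki.editor.counts, -total.editors)
+sample.wikis <- wiki.editor.counts[1:ceiling(0.01 * nrow(wiki.editor.counts))]
+@
+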
+\section{Study 1: Trajectories}
+
+\begin{figure}[t]
+<<newcomer_survival_reversion,echo=FALSE,fig.height=3.8,fig.width=5,out.width="\\columnwidth",cache=FALSE,message=FALSE,warning=FALSE>>=
+min.newcomers.plotted <- 1
+min.wikis.plotted <- 20
+p.stats[variable=='p.survives',variable:="Surviving"]
+p.stats[variable=='p.reverted',variable:="Reverted"]
+
+p.stats <- p.stats[variable=='Surviving' | variable=="Reverted"] 
+
+max.year <- 5
+p.stats <- p.stats[wiki.age.half.years/2 <= max.year]
+xlabels <-  paste0("Year ", 0:5)
+breaks <- (0:5)*2
+
+equal_breaks <- function(...){
+    function(x){
+        if(max(x) < 0.1){
+            return(c(0,0.015,0.03,0.045,0.06))
+        }
+        else{
+            return(c(0,0.2,0.4,0.6,0.8))
+        }
+    }
+}
+
+p.stats <- p.stats[variable=='Surviving',variable:="Survived"]
+p.stats <- p.stats[variable=='Reverted',variable:="Rejected"]
+p <- ggplot(p.stats,aes(x=as.factor(wiki.age.half.years),ymin=min,lower=q1,middle=med,upper=q3,ymax=max,width=0.4))
+p <- p + geom_boxplot(stat='identity')
+p <- p + geom_line(aes(x=wiki.age.half.years + 1, y=mu),color="#E69F00",linetype=2)
+p <- p + geom_line(aes(x=wiki.age.half.years + 1, y=med),color="#CC79A7",linetype=1)
+p <- p + facet_wrap("variable",nrow=2,strip.position="right",scales="free_y")
+p <- p + scale_y_continuous(name="Proportion of newcomers ",minor_breaks=NULL,breaks=equal_breaks()) + scale_x_discrete(name="Wiki age", labels=xlabels,breaks=breaks)
+p <- p + theme_minimal(base_size=12)  + theme(legend.position="None",panel.grid.major.x=element_blank(),panel.spacing=unit(2,'lines'))
+
+print(p)
+@
+\caption{Newcomer survival and rejection over time. The orange dashed line shows the mean and the pink solid line shows the median. Years with data from at least  \Sexpr{min.wikis.plotted} wikis are shown.}
+\label{newcomer-survival}
+\end{figure}
+
+We first replicate RAD's Figure 2 to determine whether the ``rise and decline'' pattern generalizes. To compare across wikis of vastly different size, we divide the number of active contributors to each wiki in a given month by the within-wiki standard deviation of that measure.
+Figure \ref{plot.editors.time} plots our results and shows a trajectory of growth and decline similar to English Wikipedia. While Wikipedia's exponential growth was more explosive and lasted longer, the average active Wikia wiki follows a similar pattern. These communities begin small, tend to grow for 3-4 years, and then transition from growth to decline. 
+Because few wikis in our dataset have existed for more than 5 years, we visualize only months with at least \Sexpr{min(plot.active.editors.dt[,N.wikis])} active wikis (the 90th percentile). 
+Although the downward trend continues after this threshold, the estimates become noisier. 
+
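+% Concretely (eval=FALSE; `monthly` and its columns are illustrative
+% names), the standardization divides each wiki-month count of active
+% editors by that wiki's own standard deviation:
+<<sd-units-sketch, eval=FALSE, echo=FALSE>>=
+monthly[, sd.units.active.editors := active.editors / sd(active.editors),
+        by=wiki]
+@
+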
+Next we replicate RAD's Figures 3 and 4 to visualize the average trajectories in newcomer survival and rejection. Our results are shown in Figure \ref{newcomer-survival}. Lines connect the mean and median rates for all wikis active in each period to show the overall trend. Box plots visualize the variation between wikis.
+The top panel of Figure \ref{newcomer-survival} corresponds to RAD's Figure 3 and shows box plots for the proportion of newcomers who survive in each year. As in Wikipedia, newcomer retention declines over time in the average wiki in our dataset. The trend is statistically significant (Spearman's \(\rho=\Sexpr{signif(survives.cor.test$estimate,2)},~p<0.001\)).
+
+The bottom panel of Figure \ref{newcomer-survival} corresponds to RAD's Figure 4 and shows box plots for the proportion of newcomers who are rejected over time. Although rejection is much less common in our wikis than in English Wikipedia, wikis in our sample exhibit increasing rates of newcomer rejection. The trend is statistically significant (Spearman's \(\rho=\Sexpr{signif(reverted.cor.test$estimate,2)},~p<0.001\)). Although our estimates point in the same direction as RAD's, the average trajectory is qualitatively different. Rates of newcomer rejection are initially very low, increase over the first year, and remain level for most of the wiki's lifetime. They begin increasing again in the 4\textsuperscript{th} year, when the number of active editors tends to decline.
+
+\section{Study 2: Newcomer survival}
+\subsection{Methods}
+
+We replicate RAD's first logistic regression model predicting whether a newcomer \emph{survived} to test whether being \emph{reverted} or \emph{tool reverted} in the first edit session makes newcomers less likely to survive. RAD includes a single variable capturing the linear effect of time, which reflects both the age of Wikipedia and the passage of calendar time. Because we have multiple wikis, we can tease apart wiki age and calendar time by measuring \emph{wiki age} as the time in years since the first edit to each wiki. We include a linear specification of this variable following RAD. We also add a control for calendar time: a categorical variable, \emph{quarter}, with a dummy variable for each 90-day calendar period.
+We also include \emph{wiki}, a categorical variable with \Sexpr{f(n.wikia.wikis)} levels to account for variation in baseline level of newcomer retention between wikis and to address issues of serial correlation in our standard errors. We do not report the results for these categorical variables, both for the sake of clarity and because they control for variation in the dataset that does not relate to the core theoretical concerns.
+
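+% Our adaptation therefore takes roughly the following form (eval=FALSE;
+% variable names are illustrative). The quarter dummies and the wiki
+% fixed effects enter as factors:
+<<replication-survival-sketch, eval=FALSE, echo=FALSE>>=
+survival.model <- glm(survived ~ wiki.age + is.reverted + is.bot.reverted +
+                        session.edits + messaged +
+                        factor(quarter) + factor(wiki),
+                      family=binomial(link="logit"),
+                      data=newcomers)
+@
+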
+We cannot replicate several facets of this part of RAD's analysis in this study. RAD considers two kinds of newcomer rejection. Although the first of these corresponds with our measure for \emph{reverted}, the RAD authors also consider whether an article created by a newcomer  in their first session is \emph{deleted}. We do not have information about deleted pages. 
+Additionally, RAD considers two kinds of algorithmic tools: fully automated ``bots'' and semi-automated editing interfaces that automatically alert human users to suspected vandalism. These interfaces are either very rare or invisible on Wikia, so our measure of \emph{tool reverted} only includes rejection by bots. Summary statistics for our analytic variables are available in the supplementary material.
+\subsection{Results}
+
+\input{tables/halfak.mod.tex}
+
+Table \ref{table:regression.1} shows our fitted regression model. This table closely mirrors the first column of RAD's Table 1.
+Like RAD, we find that newcomers reverted in their first edit session are less likely to survive (\(\beta=\Sexpr{signif(m1.coef[['is.revertedTRUE']],2)}\), \(SE = \Sexpr{signif(m1.se[['is.revertedTRUE']],2)}\)). The magnitude of our coefficient for \emph{reverted} is very close to that reported by RAD (\(\beta=-0.68\), \( SE=0.04\)). According to our model, a newcomer who is reverted in their first session has \Sexpr{signif(exp(m1.coef[['is.revertedTRUE']]),2)} times the odds of continuing to contribute as a newcomer who is not reverted.
+We also find a negative relationship between \emph{wiki age} and newcomer survival (\(\beta=\Sexpr{signif(m1.coef[['wiki.age']],2)}\), \(SE=\Sexpr{signif(m1.se[['wiki.age']],2)}\)). Again, this estimate is very close to the one reported by RAD (\(\beta=-0.40\), \(SE=0.012\)).
+Our parameter estimate for \emph{tool reverted} (\(\beta=\Sexpr{signif(m1.coef[['is.bot.revertedTRUE']],2)}\), \(SE=\Sexpr{signif(m1.se[['is.bot.revertedTRUE']],2)}\)) suggests that newcomers who are rejected by a bot might be less likely to survive. However, the magnitude of this coefficient is too small relative to its standard error to support confidence in this conclusion.\footnote{A post-hoc power analysis suggests that, even if the true relationship is the same as that observed in RAD, we may have been unable to observe it because only \Sexpr{f(sum(newcomer.summary.stats[['p.bot.reverted']]) * halfak.model@gof[5])} newcomers were reverted by bots in our dataset. See the supplementary materials for details.}
+
+
+\section{Study 3: Entrenchment}
+\subsection{Methods}
+
+Finally, we replicate RAD's second model that predicts whether or not an edit to a policy page will be reverted. Although RAD carefully distinguishes between official policy pages and essays, Wikia contributors do not systematically label policy pages in this way. Therefore, we follow Shaw and Hill \cite{shaw_laboratories_2014} and analyze all edits to the project namespace. This departure presents a substantial threat to validity, as the project namespace may be used for purposes besides documenting norms. Despite this limitation, we believe this measure provides the best available opportunity to study norm entrenchment in Wikia. Not all of the wikis in our sample utilize the namespace, so this analysis uses only the subset of \Sexpr{n.wikis.ns4} wikis that do. Summary statistics for our analytic variables are available in the supplementary material.
+
+\subsection{Results}
+\input{tables/morgan.model.tex}
+
+Table \ref{table.regression.2} shows the fitted model results and replicates RAD's Table 2.
+Like RAD, we find that contributors with greater \emph{editor tenure} are less likely to have their edits to policy pages reverted (\(\beta=\Sexpr{signif(m2.coef[['age']],2)}\), \(SE=\Sexpr{signif(m2.se[['age']],2)}\)). Our model predicts that, everything else equal, an editor with a 1-week tenure faces about \Sexpr{signif(exp(51/52*-1*m2.coef[['age']]),2)} times the odds of having their edit reverted compared to an editor with a 1-year tenure. RAD reported an odds ratio of  \Sexpr{signif(exp(51/52*0.29),2)}. 
+Also consistent with RAD, we find that project page edits become more likely to be reverted as \emph{wiki age} increases (\(\beta=\Sexpr{signif(m2.coef[['wiki.age']],2)}\), \(SE=\Sexpr{signif(m2.se[['wiki.age']],2)}\)).  According to our model, an edit to the project namespace on a wiki that is 1 year old has about \Sexpr{signif(exp(51/52*m2.coef[['wiki.age']]),2)} times the odds of rejection as when the wiki is 1 week old. 
+
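+% The odds ratios quoted above are direct transformations of the fitted
+% coefficients; for example, a 1-week versus 1-year difference in tenure
+% is 51/52 of a year (sketch only, eval=FALSE):
+<<odds-ratio-sketch, eval=FALSE, echo=FALSE>>=
+exp((51/52) * -1 * m2.coef[['age']])      # editor tenure odds ratio
+exp((51/52) * m2.coef[['wiki.age']])      # wiki age odds ratio
+@
+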
+\section{Discussion}
+
+We find that the patterns of community entrenchment documented in English Wikipedia also occur in comparable Wikia wikis. Wikis in our dataset experience growth in active contributors over about three years and then decline. Newcomer survival tends to decline over time, and newcomers who are rejected are less likely to survive. Older editors have more influence over norms, and norms become more difficult to change. 
+
+By studying these dynamics outside Wikipedia, we can rule out potential explanations of RAD's results linked to unique characteristics of Wikipedia, such as its specific culture. We can also rule out explanations linked to the specific time at which English Wikipedia experienced its decline. The diversity and size of our sample support both more precise estimation of the observed relationships in the data as well as stronger confidence in the validity of our inferences.
+
+Our work has important limitations. Our data are observational, our sample may have unknown biases, and our measures may contain hidden sources of error. For example, wiki editors may change accounts, bots may be unreported, and the project namespace may include material unrelated to norms. Omitted variables may also bias our results. Readers should be careful not to draw causal conclusions from our findings.  
+
+The units of analysis in our regression models are newcomers and edits to project namespaces. Because the wikis in our sample have different numbers of each, the average effects we report could disproportionately reflect the experience of users in the communities that contribute the most observations to our sample. As a robustness check, we fit another set of regression models where each wiki is given equal weight. Our substantive conclusions are robust to this change. Indeed, the re-weighted models suggest that the relationships reported in RAD may even be stronger in smaller or less active communities. In one unsurprising exception, we find that norm pages do not appear to become more difficult to edit over time in wikis that make very little use of the project namespace. These preliminary findings suggest that analyzing the relationship between community size and governance systems is a promising direction for future work. Details are available in the supplementary material.
+
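+% A sketch of the re-weighting (eval=FALSE; names are illustrative):
+% give each wiki equal total weight by weighting each observation by
+% the inverse of its wiki's number of observations. With non-integer
+% weights, quasibinomial() avoids spurious glm() warnings:
+<<reweighted-sketch, eval=FALSE, echo=FALSE>>=
+newcomers[, w := 1/.N, by=wiki]
+reweighted.model <- glm(survived ~ wiki.age + is.reverted + is.bot.reverted +
+                          session.edits + messaged + factor(quarter),
+                        family=quasibinomial(link="logit"),
+                        weights=w, data=newcomers)
+@
+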
+Despite our effort at generalization, we cannot know if our findings will generalize beyond the wikis in our sample. That said, we think the mechanisms driving the emergence of entrenchment on wikis are similar to mechanisms theorized to drive the emergence and centralization of authority in democratic organizations. For example, Michels' ``iron law of oligarchy'' predicts that bureaucracies arising in large democratic organizations will centralize authority \cite{michels_political_1915, shaw_laboratories_2014}. Similarly, Freeman's ``The Tyranny of Structurelessness'' describes how, even when activist groups deliberately avoid creating formalized rules and bureaucracies, informal structures arise \cite{freeman_tyranny_1972}. Indeed, due to their opacity, informal structures can be more difficult for newcomers to navigate than formalized bureaucracies. Drawing from this earlier theoretical work and our own results, we believe that the patterns of increasing entrenchment and newcomer rejection we estimate will generalize beyond the wikis in our sample to other peer production projects and informal organizations. Understanding why some communities in our sample show more entrenchment than others remains a fascinating subject for further research.
+
+\section{Conclusion}
+
+Our study supports RAD's claim that quality control practices help explain increases in entrenchment and decreases in growth among peer production communities. Our work contributes to social computing and peer production research by providing evidence in support of the external validity of RAD, an influential empirical study. We also contribute to a small but growing literature on replication in HCI by demonstrating a replication study focused on generalizability. Our evidence in support of generalizability rests not only on the signs of our regression coefficients, but also on the similarity of our point estimates and visualizations. This work supports designers and community managers who are acting on the implications of RAD's findings.
+
+\section{Acknowledgments}
+
+The authors would like to thank our anonymous reviewers and associate chairs at CHI for their thoughtful and detailed feedback. We would also like to thank Wikia for providing public access to data from their wikis and other members of the Community Data Science Collective for sharing the software, data, and research infrastructure necessary to complete this work. We thank Jonathan Morgan for providing help in planning this study and Amanda TeBlunthuis, Kaylea Champion, Wm Salt Hale, and Sayamindu Dasgupta for feedback on drafts of our manuscript. This project was completed using the Hyak high performance computing cluster at the University of Washington. Financial support for this work came from the National Science Foundation (grants IIS-1617129, IIS-1617468, and GRFP-2016220885), Northwestern University, and the University of Washington. 
+
+\section{Access to Data}
+
+A replication dataset has been placed in the Harvard Dataverse archive and is available at the following URL: \\ \href{https://doi.org/10.7910/DVN/SG3LP1}{https://doi.org/10.7910/DVN/SG3LP1}.
+
+% REFERENCES FORMAT
+\bibliographystyle{SIGCHI-Reference-Format}
+\bibliography{refs}
+
+\end{document}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
+
+%  LocalWords:  xlabels xbreaks aes loess linetype rebranded RAD's
+%  LocalWords:  replicable Hornæk Kittur sociotechnical crowdsourced
+%  LocalWords:  BRD Althistory Uncyclopedia namespace ymin ymax Hyak
+%  LocalWords:  boxplot Spearman's Structurelessness Kaylea IIS
+%  LocalWords:  Dataverse
diff --git a/paper_source/knitr/lib-01-generate_userroles.RDS b/paper_source/knitr/lib-01-generate_userroles.RDS
new file mode 100644 (file)
index 0000000..b3fd2fe
Binary files /dev/null and b/paper_source/knitr/lib-01-generate_userroles.RDS differ
diff --git a/paper_source/knitr/remember.RDS b/paper_source/knitr/remember.RDS
new file mode 100644 (file)
index 0000000..1f662aa
Binary files /dev/null and b/paper_source/knitr/remember.RDS differ
diff --git a/paper_source/refs.bib b/paper_source/refs.bib
new file mode 100644 (file)
index 0000000..42969d8
--- /dev/null
@@ -0,0 +1,1185 @@
+
+@inproceedings{halfaker_dont_2011,
+  address = {New York, New York},
+  title = {Don't Bite the Newbies: {{How}} Reverts Affect the Quantity and Quality of {{Wikipedia}} Work},
+  isbn = {978-1-4503-0909-7},
+  shorttitle = {Don't Bite the Newbies},
+  doi = {10.1145/2038558.2038585},
+  urldate = {2013-03-06},
+  booktitle = {Proceedings of the 7th {{International Symposium}} on {{Wikis}} and {{Open Collaboration}} ({{WikiSym}} '11)},
+  publisher = {{ACM}},
+  author = {Halfaker, Aaron and Kittur, Aniket and Riedl, John},
+  year = {2011},
+  keywords = {WikiWork,experience,quality,motivation,productivity,revert,wikipedia},
+  pages = {163--172}
+}
+
+@inproceedings{chen_effects_2010,
+  address = {New York, NY, USA},
+  series = {CHI '10},
+  title = {The Effects of Diversity on Group Productivity and Member Withdrawal in Online Volunteer Groups},
+  isbn = {978-1-60558-929-9},
+  doi = {10.1145/1753326.1753447},
+  urldate = {2012-06-14},
+  booktitle = {Proceedings of the 28th International Conference on {{Human}} Factors in Computing Systems},
+  publisher = {{ACM}},
+  author = {Chen, Jilin and Ren, Yuqing and Riedl, John},
+  year = {2010},
+  keywords = {diversity,performance,online volunteer group,wikipedia},
+  pages = {821--830}
+}
+
+@inproceedings{morgan_tea_2013,
+  address = {New York, NY, USA},
+  series = {CSCW '13},
+  title = {Tea and Sympathy: Crafting Positive New User Experiences on Wikipedia},
+  isbn = {978-1-4503-1331-5},
+  shorttitle = {Tea and Sympathy},
+  doi = {10.1145/2441776.2441871},
+  urldate = {2013-04-01},
+  booktitle = {Proceedings of the 2013 Conference on {{Computer}} Supported Cooperative Work},
+  publisher = {{ACM}},
+  author = {Morgan, Jonathan T. and Bouterse, Siko and Walls, Heather and Stierch, Sarah},
+  year = {2013},
+  keywords = {gender,socialization,new users,user experience,collaboration,Collaboration,wikipedia},
+  pages = {839--848}
+}
+
+@inproceedings{collier_conflict_2012,
+  address = {New York, NY, USA},
+  series = {CSCW '12},
+  title = {Conflict, Criticism, or Confidence: An Empirical Examination of the Gender Gap in Wikipedia Contributions},
+  isbn = {978-1-4503-1086-4},
+  shorttitle = {Conflict, Criticism, or Confidence},
+  doi = {10.1145/2145204.2145265},
+  urldate = {2013-04-01},
+  booktitle = {Proceedings of the {{ACM}} 2012 Conference on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Collier, Benjamin and Bear, Julia},
+  year = {2012},
+  keywords = {criticism,confidence,gender,conflict,survey,wikipedia},
+  pages = {383--392}
+}
+
+@inproceedings{forte_scaling_2008-2,
+  title = {Scaling {{Consensus}}: {{Increasing Decentralization}} in {{Wikipedia Governance}}},
+  isbn = {1530-1605},
+  shorttitle = {Scaling {{Consensus}}},
+  doi = {10.1109/HICSS.2008.383},
+  urldate = {2009-08-26},
+  booktitle = {Proceedings of the 41st {{Annual Hawaii International Conference}} on {{System Science}}},
+  publisher = {{IEEE}},
+  author = {Forte, Andrea and Bruckman, Amy},
+  year = {2008},
+  keywords = {Encyclopedias,Buildings,Educational institutions,Environmental economics,Floods,Global warming,Production,Resource management,Web sites,commons-based governance,content-related decision making process,decision making,self-organised Wikipedia governance,social structure,wikipedia},
+  pages = {157}
+}
+
+@inproceedings{halfaker_making_2013,
+  address = {New York, NY, USA},
+  series = {CSCW '13},
+  title = {Making Peripheral Participation Legitimate: Reader Engagement Experiments in Wikipedia},
+  isbn = {978-1-4503-1331-5},
+  shorttitle = {Making Peripheral Participation Legitimate},
+  doi = {10.1145/2441776.2441872},
+  urldate = {2013-07-06},
+  booktitle = {Proceedings of the 2013 Conference on {{Computer}} Supported Cooperative Work},
+  publisher = {{ACM}},
+  author = {Halfaker, Aaron and Keyes, Oliver and Taraborelli, Dario},
+  year = {2013},
+  keywords = {Experiment,legitimate peripheral participation,participation,open production,quatitative,social learning,wikipedia},
+  pages = {849--860}
+}
+
+@phdthesis{loubser_organisational_2010,
+  address = {Oxford, United Kingdom},
+  type = {DPhil},
+  title = {Organisational Mechanisms in Peer Production: {{The}} Case of {{Wikipedia}}},
+  school = {Oxford Internet Institute, Oxford University},
+  author = {Loubser, Max},
+  month = jul,
+  year = {2010}
+}
+
+@book{michels_political_1915,
+  address = {New York, NY},
+  title = {Political {{Parties}}: {{A Sociological Study}} of the {{Oligarchical Tendencies}} of {{Modern Democracy}}},
+  shorttitle = {Political {{Parties}}},
+  publisher = {{Hearst}},
+  author = {Michels, Robert},
+  translator = {Paul, Eden and Paul, Cedar},
+  year = {1915}
+}
+
+@inproceedings{geiger_using_2013,
+  address = {New York, NY, USA},
+  series = {CSCW '13},
+  title = {Using {{Edit Sessions}} to {{Measure Participation}} in {{Wikipedia}}},
+  isbn = {978-1-4503-1331-5},
+  doi = {10.1145/2441776.2441873},
+  urldate = {2014-01-11},
+  booktitle = {Proceedings of the 2013 {{Conference}} on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Geiger, R. Stuart and Halfaker, Aaron},
+  year = {2013},
+  keywords = {sessions,work,work practices,activity,labor-hours,quantitative methods,labor,peer production,wikipedia},
+  pages = {861--870}
+}
+
+@phdthesis{ortega_wikipedia:_2009,
+  address = {Madrid, Spain},
+  type = {Ph.D. dissertation},
+  title = {Wikipedia: {{A Quantitative Analysis}}},
+  language = {English},
+  urldate = {2009-06-22},
+  school = {Universidad Rey Juan Carlos},
+  author = {Ortega, Felipe},
+  year = {2009}
+}
+
+@inproceedings{panciera_wikipedians_2009,
+  address = {New York, New York},
+  series = {GROUP '09},
+  title = {Wikipedians Are Born, Not Made: A Study of Power Editors on {{Wikipedia}}},
+  isbn = {978-1-60558-500-0},
+  shorttitle = {Wikipedians Are Born, Not Made},
+  doi = {10.1145/1531674.1531682},
+  urldate = {2012-06-14},
+  booktitle = {Proceedings of the {{ACM}} 2009 International Conference on {{Supporting}} Group Work},
+  publisher = {{ACM}},
+  author = {Panciera, Katherine and Halfaker, Aaron and Terveen, Loren},
+  year = {2009},
+  keywords = {contribution,power editors,collaboration,Wiki,Collaboration,wikipedia},
+  pages = {51--60}
+}
+
+@article{shah_motivation_2006,
+  title = {Motivation, Governance, and the Viability of Hybrid Forms in Open Source Software Development},
+  volume = {52},
+  doi = {10.1287/mnsc.1060.0553},
+  number = {7},
+  urldate = {2009-12-07},
+  journal = {Management Science},
+  author = {Shah, Sonali K.},
+  month = jul,
+  year = {2006},
+  keywords = {To Read,MANAGEMENT,FOSS},
+  pages = {1000--1014}
+}
+
+@article{hargittai_mind_2015-1,
+  title = {Mind the Skills Gap: The Role of Internet Know-How and Gender in Differentiated Contributions to {{Wikipedia}}},
+  volume = {18},
+  issn = {1369-118X},
+  shorttitle = {Mind the Skills Gap},
+  doi = {10.1080/1369118X.2014.957711},
+  number = {4},
+  urldate = {2015-01-09},
+  journal = {Information, Communication \& Society},
+  author = {Hargittai, Eszter and Shaw, Aaron},
+  month = apr,
+  year = {2015},
+  pages = {424--442}
+}
+
+@unpublished{gorbatai_aligning_2011,
+  address = {Cambridge, MA},
+  type = {Working Paper},
+  title = {Aligning Collective Production with Demand: {{Evidence}} from {{Wikipedia}}},
+  author = {Gorbatai, Andreea},
+  year = {2011},
+  note = {Available at SSRN 1949327}
+}
+
+@book{ostrom_governing_1990,
+  address = {New York, NY},
+  title = {Governing the Commons: {{The}} Evolution of Institutions for Collective Action},
+  shorttitle = {Governing the {{Commons}}},
+  publisher = {{Cambridge University Press}},
+  author = {Ostrom, Elinor},
+  year = {1990}
+}
+
+@article{shaw_centralized_2012,
+  title = {Centralized and {{Decentralized Gatekeeping}} in an {{Open Online Collective}}},
+  volume = {40},
+  issn = {0032-3292, 1552-7514},
+  doi = {10.1177/0032329212449009},
+  language = {en},
+  number = {3},
+  urldate = {2012-08-27},
+  journal = {Politics \& Society},
+  author = {Shaw, Aaron},
+  year = {2012},
+  keywords = {Internet,gatekeeping,politics,Organizations,social movements},
+  pages = {349--388}
+}
+
+@article{hannan_population_1977,
+  title = {The Population Ecology of Organizations},
+  volume = {82},
+  issn = {0002-9602},
+  doi = {10.2307/2777807},
+  number = {5},
+  urldate = {2013-10-18},
+  journal = {American Journal of Sociology},
+  author = {Hannan, Michael T. and Freeman, John},
+  year = {1977},
+  pages = {929--964}
+}
+
+@article{meyer_institutionalized_1977,
+  title = {Institutionalized Organizations: {{Formal}} Structure as Myth and Ceremony},
+  volume = {83},
+  issn = {00029602},
+  shorttitle = {Institutionalized {{Organizations}}},
+  number = {2},
+  urldate = {2008-10-09},
+  journal = {American Journal of Sociology},
+  author = {Meyer, John W. and Rowan, Brian},
+  month = sep,
+  year = {1977},
+  keywords = {Organization Behavior,Sociology},
+  pages = {340--363}
+}
+
+@book{scott_organizations_2006,
+  address = {Upper Saddle River, New Jersey},
+  title = {Organizations and Organizing: {{Rational}}, Natural and Open Systems Perspectives},
+  isbn = {0-13-195893-3},
+  shorttitle = {Organizations and {{Organizing}}},
+  publisher = {{Pearson Prentice Hall}},
+  author = {Scott, W. Richard and Davis, Gerald F},
+  year = {2006},
+  keywords = {Organization Behavior,Sociology}
+}
+
+@book{lessig_code_1999,
+  address = {New York, NY},
+  title = {Code and {{Other Laws}} of {{Cyberspace}}},
+  isbn = {978-0-465-03912-8},
+  publisher = {{Basic Books}},
+  author = {Lessig, Lawrence},
+  year = {1999}
+}
+
+@book{schweik_internet_2012,
+  address = {Cambridge, Massachusetts},
+  title = {Internet Success: A Study of Open-Source Software Commons},
+  isbn = {978-0-262-01725-1},
+  lccn = {QA76.76.S46 S37 2012},
+  shorttitle = {Internet Success},
+  publisher = {{MIT Press}},
+  author = {Schweik, Charles M. and English, Robert C.},
+  year = {2012},
+  keywords = {Computers / Information Theory,Computers / Programming / Open Source,Information commons,Open source software}
+}
+
+@article{crowston_free/libre_2008,
+  title = {Free/Libre Open-Source Software Development: {{What}} We Know and What We Do Not Know},
+  volume = {44},
+  issn = {0360-0300},
+  shorttitle = {Free/{{Libre Open}}-Source {{Software Development}}},
+  doi = {10.1145/2089125.2089127},
+  number = {2},
+  urldate = {2016-02-01},
+  journal = {ACM Computing Surveys},
+  author = {Crowston, Kevin and Wei, Kangning and Howison, James and Wiggins, Andrea},
+  month = mar,
+  year = {2008},
+  keywords = {Free/Libre open-source software,distributed work,development,computer-mediated communication},
+  pages = {7:1--7:35}
+}
+
+@inproceedings{lam_wp:clubhouse?:_2011,
+  address = {New York, New York},
+  series = {WikiSym '11},
+  title = {{{WP}}:{{Clubhouse}}?: {{An Exploration}} of {{Wikipedia}}'s {{Gender Imbalance}}},
+  isbn = {978-1-4503-0909-7},
+  shorttitle = {{{WP}}},
+  doi = {10.1145/2038558.2038560},
+  urldate = {2015-03-08},
+  booktitle = {Proceedings of the 7th {{International Symposium}} on {{Wikis}} and {{Open Collaboration}}},
+  publisher = {{ACM}},
+  author = {Lam, Shyong (Tony) K. and Uduwage, Anuradha and Dong, Zhenhua and Sen, Shilad and Musicant, David R. and Terveen, Loren and Riedl, John},
+  year = {2011},
+  keywords = {collaboration,Collaboration,content coverage,gender gap,wikipedia},
+  pages = {1--10}
+}
+
+@inproceedings{butler_dont_2008,
+  address = {New York, NY, USA},
+  series = {CHI '08},
+  title = {Don't {{Look Now}}, but {{We've Created}} a {{Bureaucracy}}: {{The Nature}} and {{Roles}} of {{Policies}} and {{Rules}} in {{Wikipedia}}},
+  isbn = {978-1-60558-011-1},
+  shorttitle = {Don't {{Look Now}}, but {{We}}'{{Ve Created}} a {{Bureaucracy}}},
+  doi = {10.1145/1357054.1357227},
+  urldate = {2015-05-09},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Butler, Brian and Joyce, Elisabeth and Pike, Jacqueline},
+  year = {2008},
+  keywords = {policy,wikis,dynamics,policies,rules,collaboration,Collaboration,wikipedia,community},
+  pages = {1101--1110}
+}
+
+@inproceedings{beschastnikh_wikipedian_2008,
+  title = {Wikipedian {{Self}}-{{Governance}} in {{Action}}: {{Motivating}} the {{Policy Lens}}},
+  shorttitle = {Wikipedian {{Self}}-{{Governance}} in {{Action}}},
+  urldate = {2015-05-09},
+  booktitle = {{{ICWSM}}},
+  author = {Beschastnikh, Ivan and Kriplean, Travis and McDonald, David W.},
+  year = {2008}
+}
+
+@inproceedings{kittur_he_2007-1,
+  address = {New York, NY, USA},
+  series = {CHI '07},
+  title = {He {{Says}}, {{She Says}}: {{Conflict}} and {{Coordination}} in {{Wikipedia}}},
+  isbn = {978-1-59593-593-9},
+  shorttitle = {He {{Says}}, {{She Says}}},
+  doi = {10.1145/1240624.1240698},
+  urldate = {2015-03-08},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Kittur, Aniket and Suh, Bongwon and Pendleton, Bryan A. and Chi, Ed H.},
+  year = {2007},
+  keywords = {collaboration,Wiki,Collaboration,conflict,user model,web-based interaction,visualization,wikipedia},
+  pages = {453--462}
+}
+
+@inproceedings{viegas_hidden_2007-2,
+  series = {Lecture Notes in Computer Science},
+  title = {The {{Hidden Order}} of {{Wikipedia}}},
+  isbn = {978-3-540-73256-3 978-3-540-73257-0},
+  language = {en},
+  urldate = {2015-05-09},
+  booktitle = {Online {{Communities}} and {{Social Computing}}},
+  publisher = {{Springer Berlin Heidelberg}},
+  author = {Vi{\'e}gas, Fernanda B. and Wattenberg, Martin and McKeon, Matthew M.},
+  editor = {Schuler, Douglas},
+  year = {2007},
+  keywords = {Personal Computing,Computers and Society,Computer Communication Networks,Management of Computing and Information Systems,Information Systems Applications (incl.Internet),Legal Aspects of Computing,commons,governance,peer production,wikipedia},
+  pages = {445--454}
+}
+
+@inproceedings{viegas_talk_2007,
+  title = {Talk {{Before You Type}}: {{Coordination}} in {{Wikipedia}}},
+  shorttitle = {Talk {{Before You Type}}},
+  doi = {10.1109/HICSS.2007.511},
+  booktitle = {40th {{Annual Hawaii International Conference}} on {{System Sciences}}, 2007. {{HICSS}} 2007},
+  author = {Viegas, F.B. and Wattenberg, M. and Kriss, J. and {van Ham}, F.},
+  month = jan,
+  year = {2007},
+  keywords = {Internet,collaboration,encyclopaedias,Talk page,online encyclopedia,Guidelines,Visual communication,strategic planning,visualization,wikipedia,encyclopedias,information resources,resilience},
+  pages = {78--78}
+}
+
+@book{kraut_building_2012,
+  address = {Cambridge, MA},
+  title = {Building Successful Online Communities: {{Evidence}}-Based Social Design},
+  isbn = {978-0-262-29831-5},
+  language = {English},
+  publisher = {{MIT Press}},
+  author = {Kraut, Robert E. and Resnick, Paul and Kiesler, Sara},
+  year = {2012},
+  keywords = {design,foundations of social computing}
+}
+
+@inproceedings{kittur_beyond_2010,
+  address = {New York, New York},
+  series = {CSCW '10},
+  title = {Beyond {{Wikipedia}}: {{Coordination}} and {{Conflict}} in {{Online Production Groups}}},
+  isbn = {978-1-60558-795-0},
+  shorttitle = {Beyond {{Wikipedia}}},
+  doi = {10.1145/1718918.1718959},
+  urldate = {2015-07-31},
+  booktitle = {Proceedings of the 2010 {{ACM Conference}} on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Kittur, Aniket and Kraut, Robert E.},
+  year = {2010},
+  keywords = {coordination,collaboration,collective intelligence,distributed cognition,social computing,Wiki,Collaboration,conflict,online production,wikipedia},
+  pages = {215--224}
+}
+
+@article{forte_defining_2013,
+  title = {Defining, Understanding, and Supporting Open Collaboration: {{Lessons}} from the Literature},
+  volume = {57},
+  issn = {0002-7642, 1552-3381},
+  doi = {10.1177/0002764212469362},
+  language = {en},
+  number = {5},
+  urldate = {2016-03-24},
+  journal = {American Behavioral Scientist},
+  author = {Forte, Andrea and Lampe, Cliff},
+  year = {2013},
+  keywords = {Open source software,Wiki,open collaboration,peer production,wikipedia},
+  pages = {535--547}
+}
+
+@inproceedings{priedhorsky_creating_2007,
+  address = {New York, NY},
+  series = {GROUP '07},
+  title = {Creating, Destroying, and Restoring Value in Wikipedia},
+  isbn = {978-1-59593-845-9},
+  doi = {10.1145/1316624.1316663},
+  urldate = {2015-03-08},
+  booktitle = {Proceedings of the 2007 {{International ACM Conference}} on {{Supporting Group Work}}},
+  publisher = {{ACM}},
+  author = {Priedhorsky, Reid and Chen, Jilin and Lam, Shyong (Tony) K. and Panciera, Katherine and Terveen, Loren and Riedl, John},
+  year = {2007},
+  keywords = {collaboration,Wiki,Collaboration,damage,vandalism,wikipedia},
+  pages = {259--268}
+}
+
+@inproceedings{halfaker_jury_2009,
+  address = {New York, NY, USA},
+  series = {WikiSym '09},
+  title = {A {{Jury}} of {{Your Peers}}: {{Quality}}, {{Experience}} and {{Ownership}} in {{Wikipedia}}},
+  isbn = {978-1-60558-730-1},
+  shorttitle = {A {{Jury}} of {{Your Peers}}},
+  doi = {10.1145/1641309.1641332},
+  urldate = {2015-03-08},
+  booktitle = {Proceedings of the 5th {{International Symposium}} on {{Wikis}} and {{Open Collaboration}}},
+  publisher = {{ACM}},
+  author = {Halfaker, Aaron and Kittur, Aniket and Kraut, Robert and Riedl, John},
+  year = {2009},
+  keywords = {WikiWork,experience,ownership,peer,peer review,quality,wikipedia},
+  pages = {15:1--15:10}
+}
+
+@inproceedings{kittur_harnessing_2008,
+  address = {New York, NY, USA},
+  series = {CSCW '08},
+  title = {Harnessing the {{Wisdom}} of {{Crowds}} in {{Wikipedia}}: {{Quality Through Coordination}}},
+  isbn = {978-1-60558-007-4},
+  shorttitle = {Harnessing the {{Wisdom}} of {{Crowds}} in {{Wikipedia}}},
+  doi = {10.1145/1460563.1460572},
+  urldate = {2015-05-09},
+  booktitle = {Proceedings of the 2008 {{ACM Conference}} on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Kittur, Aniket and Kraut, Robert E.},
+  year = {2008},
+  keywords = {coordination,collaboration,collective intelligence,distributed cognition,quality of content,social computing,social interaction,Wiki,wikipedia},
+  pages = {37--46}
+}
+
+@article{shaw_laboratories_2014,
+  title = {Laboratories of Oligarchy? {{How}} the Iron Law Extends to Peer Production},
+  volume = {64},
+  issn = {1460-2466},
+  shorttitle = {Laboratories of {{Oligarchy}}?},
+  doi = {10.1111/jcom.12082},
+  language = {en},
+  number = {2},
+  urldate = {2015-05-09},
+  journal = {Journal of Communication},
+  author = {Shaw, Aaron and Hill, Benjamin M.},
+  year = {2014},
+  pages = {215--238}
+}
+
+@article{coleman_social_1988,
+  title = {Social {{Capital}} in the {{Creation}} of {{Human Capital}}},
+  volume = {94},
+  issn = {0002-9602},
+  urldate = {2016-05-13},
+  journal = {American Journal of Sociology},
+  author = {Coleman, James S.},
+  year = {1988},
+  pages = {S95--S120}
+}
+
+@inproceedings{arazy_functional_2015,
+  address = {New York, NY, USA},
+  series = {CSCW '15},
+  title = {Functional {{Roles}} and {{Career Paths}} in {{Wikipedia}}},
+  isbn = {978-1-4503-2922-4},
+  doi = {10.1145/2675133.2675257},
+  urldate = {2015-11-11},
+  booktitle = {Proceedings of the 18th {{ACM Conference}} on {{Computer Supported Cooperative Work}} \& {{Social Computing}}},
+  publisher = {{ACM}},
+  author = {Arazy, Ofer and Ortega, Felipe and Nov, Oded and Yeo, Lisa and Balila, Adam},
+  year = {2015},
+  keywords = {peer-production,functional roles,role transitions,ORGANIZATIONAL structure,wikipedia},
+  pages = {1092--1105}
+}
+
+@article{benkler_coases_2002-2,
+  title = {Coase's Penguin, or, {{Linux}} and the Nature of the Firm},
+  volume = {112},
+  number = {3},
+  urldate = {2008-09-14},
+  journal = {Yale Law Journal},
+  author = {Benkler, Yochai},
+  year = {2002},
+  keywords = {Internet,Advantages,Economics,FOSS,Law,Legal Studies,Open source software,Production cooperatives,Socioeconomic factors},
+  pages = {369--446}
+}
+
+@article{black_self-governance_2011,
+  title = {Self-{{Governance Through Group Discussion}} in {{Wikipedia}}: {{Measuring Deliberation}} in {{Online Groups}}},
+  volume = {42},
+  issn = {1046-4964, 1552-8278},
+  doi = {10.1177/1046496411406137},
+  language = {en},
+  number = {5},
+  urldate = {2014-01-01},
+  journal = {Small Group Research},
+  author = {Black, Laura W. and Welser, Howard T. and Cosley, Dan and DeGroot, Jocelyn M.},
+  month = oct,
+  year = {2011},
+  keywords = {Virtual teams,computer-mediated communication (CMC),deliberative discussion,Online Communities},
+  pages = {595--634}
+}
+
+@inproceedings{bryant_becoming_2005,
+  address = {New York, NY},
+  series = {GROUP '05},
+  title = {Becoming {{Wikipedian}}: {{Transformation}} of Participation in a Collaborative Online Encyclopedia},
+  isbn = {1-59593-223-2},
+  shorttitle = {Becoming {{Wikipedian}}},
+  doi = {10.1145/1099203.1099205},
+  urldate = {2015-05-09},
+  booktitle = {Proceedings of the 2005 {{International ACM SIGGROUP Conference}} on {{Supporting Group Work}}},
+  publisher = {{ACM}},
+  author = {Bryant, Susan L. and Forte, Andrea and Bruckman, Amy},
+  year = {2005},
+  keywords = {activity theory,legitimate peripheral participation,Wiki,qualitative,wikipedia,community},
+  pages = {1--10}
+}
+
+@article{forte_decentralization_2009,
+  title = {Decentralization in {{Wikipedia Governance}}},
+  volume = {26},
+  issn = {0742-1222},
+  doi = {10.2753/MIS0742-1222260103},
+  number = {1},
+  urldate = {2013-01-07},
+  journal = {Journal of Management Information Systems},
+  author = {Forte, Andrea and Larco, Vanesa and Bruckman, Amy},
+  year = {2009},
+  keywords = {governance,Online Communities,Self-Organizing Systems,wikipedia},
+  pages = {49--72}
+}
+
+@article{konieczny_governance_2009,
+  title = {Governance, Organization, and Democracy on the Internet: The Iron Law and the Evolution of {{Wikipedia}}},
+  volume = {24},
+  issn = {1573-7861},
+  shorttitle = {Governance, {{Organization}}, and {{Democracy}} on the {{Internet}}},
+  doi = {10.1111/j.1573-7861.2008.01090.x},
+  language = {en},
+  number = {1},
+  urldate = {2011-11-21},
+  journal = {Sociological Forum},
+  author = {Konieczny, Piotr},
+  year = {2009},
+  keywords = {Democracy,oligarchy,Internet,organization,wikipedia,community},
+  pages = {162--192}
+}
+
+@inproceedings{kriplean_community_2007-1,
+  address = {New York, NY, USA},
+  series = {GROUP '07},
+  title = {Community, {{Consensus}}, {{Coercion}}, {{Control}}: {{CS}}*{{W}} or {{How Policy Mediates Mass Participation}}},
+  isbn = {978-1-59593-845-9},
+  shorttitle = {Community, {{Consensus}}, {{Coercion}}, {{Control}}},
+  doi = {10.1145/1316624.1316648},
+  urldate = {2015-05-09},
+  booktitle = {Proceedings of the 2007 {{International ACM Conference}} on {{Supporting Group Work}}},
+  publisher = {{ACM}},
+  author = {Kriplean, Travis and Beschastnikh, Ivan and McDonald, David W. and Golder, Scott A.},
+  year = {2007},
+  keywords = {policy,collaborative authoring,power,wikipedia,community},
+  pages = {167--176}
+}
+
+@inproceedings{ciampaglia_moodbar:_2015,
+  address = {New York, NY, USA},
+  series = {CSCW '15},
+  title = {{{MoodBar}}: {{Increasing New User Retention}} in {{Wikipedia Through Lightweight Socialization}}},
+  isbn = {978-1-4503-2922-4},
+  shorttitle = {{{MoodBar}}},
+  doi = {10.1145/2675133.2675181},
+  urldate = {2016-05-26},
+  booktitle = {Proceedings of the 18th {{ACM Conference}} on {{Computer Supported Cooperative Work}} \& {{Social Computing}}},
+  publisher = {{ACM}},
+  author = {Ciampaglia, Giovanni Luca and Taraborelli, Dario},
+  year = {2015},
+  keywords = {Experiment,online community,socialization,wikipedia,user retention},
+  pages = {734--742}
+}
+
+@inproceedings{farzan_socializing_2012,
+  address = {New York, NY, USA},
+  series = {CSCW '12},
+  title = {Socializing {{Volunteers}} in an {{Online Community}}: {{A Field Experiment}}},
+  isbn = {978-1-4503-1086-4},
+  shorttitle = {Socializing {{Volunteers}} in an {{Online Community}}},
+  doi = {10.1145/2145204.2145256},
+  urldate = {2016-05-26},
+  booktitle = {Proceedings of the {{ACM}} 2012 {{Conference}} on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Farzan, Rosta and Kraut, Robert and Pal, Aditya and Konstan, Joseph},
+  year = {2012},
+  keywords = {Experiment,online volunteer communities,socialization},
+  pages = {325--334}
+}
+
+@article{ducheneaut_socialization_2005,
+  title = {Socialization in an {{Open Source Software Community}}: {{A Socio}}-{{Technical Analysis}}},
+  volume = {14},
+  issn = {1573-7551},
+  doi = {10.1007/s10606-005-9000-1},
+  number = {4},
+  journal = {Computer Supported Cooperative Work (CSCW)},
+  author = {Ducheneaut, Nicolas},
+  year = {2005},
+  pages = {323--368}
+}
+
+@inproceedings{suh_singularity_2009-2,
+  address = {New York, New York},
+  series = {WikiSym '09},
+  title = {The {{Singularity}} Is {{Not Near}}: {{Slowing Growth}} of {{Wikipedia}}},
+  isbn = {978-1-60558-730-1},
+  shorttitle = {The {{Singularity}} Is {{Not Near}}},
+  doi = {10.1145/1641309.1641322},
+  urldate = {2016-04-21},
+  booktitle = {Proceedings of the 5th {{International Symposium}} on {{Wikis}} and {{Open Collaboration}}},
+  publisher = {{ACM}},
+  author = {Suh, Bongwon and Convertino, Gregorio and Chi, Ed H. and Pirolli, Peter},
+  year = {2009},
+  keywords = {growth,logistic model,population,power law,resistance,wikipedia},
+  pages = {1--10}
+}
+
+@inproceedings{geiger_defense_2012,
+  address = {Dublin, Ireland},
+  title = {Defense {{Mechanism}} or {{Socialization Tactic}}? {{Improving Wikipedia}}'s {{Notifications}} to {{Rejected Contributors}}},
+  shorttitle = {Defense {{Mechanism}} or {{Socialization Tactic}}?},
+  urldate = {2016-05-27},
+  booktitle = {Proceedings of the {{Sixth International AAAI Conference}} on {{Weblogs}} and {{Social Media}}},
+  publisher = {{AAAI Publications}},
+  author = {Geiger, R. Stuart and Halfaker, Aaron and Pinchuk, Maryana and Walling, Steven},
+  month = may,
+  year = {2012},
+  pages = {122--129}
+}
+
+@techreport{bollen_social_2015,
+  title = {Social, {{Behavioral}}, and {{Economic Sciences Perspectives}} on {{Robust}} and {{Reliable Science}}},
+  institution = {{National Science Foundation}},
+  author = {Bollen, Kenneth and Cacioppo, John T. and Kaplan, Robert M. and Krosnick, Jon A. and Olds, James L. and Dean, Heather},
+  month = may,
+  year = {2015}
+}
+
+@article{spinellis_collaborative_2008-1,
+  title = {The {{Collaborative Organization}} of {{Knowledge}}},
+  volume = {51},
+  issn = {0001-0782},
+  doi = {10.1145/1378704.1378720},
+  number = {8},
+  urldate = {2016-09-06},
+  journal = {Communications of the ACM},
+  author = {Spinellis, Diomidis and Louridas, Panagiotis},
+  month = aug,
+  year = {2008},
+  pages = {68--73}
+}
+
+@article{stvilia_issues_2009,
+  title = {Issues of Cross-Contextual Information Quality Evaluation\textemdash{}{{The}} Case of {{Arabic}}, {{English}}, and {{Korean Wikipedias}}},
+  volume = {31},
+  issn = {0740-8188},
+  doi = {10.1016/j.lisr.2009.07.005},
+  language = {en},
+  number = {4},
+  urldate = {2016-09-15},
+  journal = {Library \& Information Science Research},
+  author = {Stvilia, Besiki and Al-Faraj, Abdullah and Yi, Yong Jeong},
+  month = dec,
+  year = {2009},
+  pages = {232--239}
+}
+
+@inproceedings{foote_starting_2017,
+  address = {New York, New York},
+  title = {Starting Online Communities: Motivations and Goals of Wiki Founders},
+  isbn = {978-1-4503-4655-9},
+  shorttitle = {Starting {{Online Communities}}},
+  doi = {10.1145/3025453.3025639},
+  urldate = {2017-05-15},
+  booktitle = {Proceedings of the 2017 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}} ({{CHI}} '17)},
+  publisher = {{ACM}},
+  author = {Foote, Jeremy and Gergle, Darren and Shaw, Aaron},
+  year = {2017},
+  keywords = {wikis,motivation,survey,peer production,Online Communities},
+  pages = {6376--6380}
+}
+
+@inproceedings{narayan_wikipedia_2017,
+  address = {New York, NY, USA},
+  series = {CSCW '17},
+  title = {The {{Wikipedia Adventure}}: {{Field Evaluation}} of an {{Interactive Tutorial}} for {{New Users}}},
+  isbn = {978-1-4503-4335-0},
+  shorttitle = {The {{Wikipedia Adventure}}},
+  doi = {10.1145/2998181.2998307},
+  urldate = {2017-03-21},
+  booktitle = {Proceedings of the 2017 {{ACM Conference}} on {{Computer Supported Cooperative Work}} and {{Social Computing}}},
+  publisher = {{ACM}},
+  author = {Narayan, Sneha and Orlowitz, Jake and Morgan, Jonathan and Hill, Benjamin Mako and Shaw, Aaron},
+  year = {2017},
+  keywords = {Gamification,systems design,systems evaluation,newcomer socialization,peer production,wikipedia,Online Communities,Systems design},
+  pages = {1785--1799}
+}
+
+@inproceedings{halfaker_snuggle:_2014,
+  address = {New York, NY, USA},
+  series = {CHI '14},
+  title = {Snuggle: {{Designing}} for {{Efficient Socialization}} and {{Ideological Critique}}},
+  isbn = {978-1-4503-2473-1},
+  shorttitle = {Snuggle},
+  doi = {10.1145/2556288.2557313},
+  urldate = {2017-03-21},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Halfaker, Aaron and Geiger, R. Stuart and Terveen, Loren G.},
+  year = {2014},
+  keywords = {Algorithms,activism,critique,design,quantitative,newcomer socialization,wikipedia},
+  pages = {311--320}
+}
+
+@inproceedings{palen_success_2015,
+  address = {New York, New York},
+  series = {CHI '15},
+  title = {Success \& {{Scale}} in a {{Data}}-{{Producing Organization}}: {{The Socio}}-{{Technical Evolution}} of {{OpenStreetMap}} in {{Response}} to {{Humanitarian Events}}},
+  isbn = {978-1-4503-3145-6},
+  shorttitle = {Success \& {{Scale}} in a {{Data}}-{{Producing Organization}}},
+  doi = {10.1145/2702123.2702294},
+  urldate = {2017-06-14},
+  booktitle = {Proceedings of the 33rd {{Annual ACM Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Palen, Leysia and Soden, Robert and Anderson, T. Jennings and Barrenechea, Mario},
+  year = {2015},
+  keywords = {Organizational Behavior,social computing,crisis informatics,crowdwork,geospatial data,mapping,open data,organizational behavior},
+  pages = {4113--4122}
+}
+
+@article{isaac_group_1994,
+  title = {Group Size and the Voluntary Provision of Public Goods},
+  volume = {54},
+  issn = {0047-2727},
+  doi = {10.1016/0047-2727(94)90068-X},
+  number = {1},
+  urldate = {2017-06-15},
+  journal = {Journal of Public Economics},
+  author = {Isaac, R. Mark and Walker, James M. and Williams, Arlington W.},
+  month = may,
+  year = {1994},
+  pages = {1--36}
+}
+
+@inproceedings{kiene_surviving_2016,
+  address = {San Jose, CA, USA},
+  title = {Surviving an ``{{Eternal September}}'': {{How}} an {{Online Community Managed}} a {{Surge}} of {{Newcomers}}},
+  urldate = {2016-03-14},
+  booktitle = {Proceedings of the 2016 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Kiene, Charles and Monroy-Hern{\'a}ndez, Andr{\'e}s and Hill, Benjamin Mako},
+  year = {2016}
+}
+
+@inproceedings{geiger_work_2010,
+  address = {New York, New York},
+  series = {CSCW '10},
+  title = {The {{Work}} of {{Sustaining Order}} in {{Wikipedia}}: {{The Banning}} of a {{Vandal}}},
+  isbn = {978-1-60558-795-0},
+  shorttitle = {The {{Work}} of {{Sustaining Order}} in {{Wikipedia}}},
+  doi = {10.1145/1718918.1718941},
+  urldate = {2017-07-12},
+  booktitle = {Proceedings of the 2010 {{ACM Conference}} on {{Computer Supported Cooperative Work}}},
+  publisher = {{ACM}},
+  author = {Geiger, R. Stuart and Ribes, David},
+  year = {2010},
+  keywords = {bots,collaboration,distributed cognition,Wiki,Collaboration,ethnography,qualitative,social,trace ethnography,wikipedia},
+  pages = {117--126}
+}
+
+@misc{simonite_fight_????,
+  title = {The {{Fight}} to {{Save Wikipedia}} from {{Itself}}},
+  urldate = {2017-07-13},
+  howpublished = {https://www.technologyreview.com/s/520446/the-decline-of-wikipedia/},
+  journal = {MIT Technology Review},
+  author = {Simonite, Tom}
+}
+
+@inproceedings{hall_freedom_2017,
+  address = {New York, NY, USA},
+  series = {CHI '17},
+  title = {Freedom {{Versus Standardization}}: {{Structured Data Generation}} in a {{Peer Production Community}}},
+  isbn = {978-1-4503-4655-9},
+  shorttitle = {Freedom {{Versus Standardization}}},
+  doi = {10.1145/3025453.3025940},
+  urldate = {2017-07-13},
+  booktitle = {Proceedings of the 2017 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Hall, Andrew and McRoberts, Sarah and Thebault-Spieker, Jacob and Lin, Yilun and Sen, Shilad and Hecht, Brent and Terveen, Loren},
+  year = {2017},
+  keywords = {Standardization,openstreetmap,peer-production communities,structured data},
+  pages = {6352--6362}
+}
+
+@article{preece_etiquette_2004,
+  title = {Etiquette, Empathy and Trust in Communities of Practice: {{Stepping}}-Stones to Social Capital.},
+  volume = {10},
+  shorttitle = {Etiquette, Empathy and Trust in Communities of Practice},
+  number = {3},
+  urldate = {2017-07-14},
+  journal = {J. UCS},
+  author = {Preece, Jennifer},
+  year = {2004},
+  pages = {294--302}
+}
+
+@article{winner_artifacts_1980,
+  title = {Do Artifacts Have Politics?},
+  urldate = {2016-01-07},
+  journal = {Daedalus},
+  author = {Winner, Langdon},
+  year = {1980},
+  pages = {121--136}
+}
+
+@incollection{hill_studying_2017,
+  address = {Oxford},
+  title = {Studying {{Populations}} of {{Online Communities}}},
+  urldate = {2017-07-21},
+  booktitle = {Oxford {{Handbook}} of {{Networked Communication}}},
+  publisher = {{Oxford University Press}},
+  author = {Hill, Benjamin Mako and Shaw, Aaron},
+  editor = {Gonz{\'a}lez-Bail{\'o}n, Sandra and Foucault Welles, Brooke},
+  year = {2017}
+}
+
+@inproceedings{matias_skill_2016,
+  address = {New York, NY, USA},
+  series = {CHI '16},
+  title = {Skill {{Progression}} in {{Scratch Revisited}}},
+  isbn = {978-1-4503-3362-7},
+  doi = {10.1145/2858036.2858349},
+  urldate = {2017-07-25},
+  booktitle = {Proceedings of the 2016 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Matias, J. Nathan and Dasgupta, Sayamindu and Hill, Benjamin Mako},
+  year = {2016},
+  keywords = {computers and children,creativity support tools,replication,Online Communities,Learning},
+  pages = {1486--1490}
+}
+
+@article{crawford_what_2016,
+  title = {What Is a Flag for? {{Social}} Media Reporting Tools and the Vocabulary of Complaint},
+  volume = {18},
+  issn = {1461-4448, 1461-7315},
+  shorttitle = {What Is a Flag For?},
+  doi = {10.1177/1461444814543163},
+  language = {en},
+  number = {3},
+  urldate = {2016-09-09},
+  journal = {New Media \& Society},
+  author = {Crawford, Kate and Gillespie, Tarleton},
+  month = mar,
+  year = {2016},
+  keywords = {YouTube,Facebook,flagging,norms,platforms,twitter,community},
+  pages = {410--428}
+}
+
+@article{mciver_wikipedia_2014,
+  title = {Wikipedia {{Usage Estimates Prevalence}} of {{Influenza}}-{{Like Illness}} in the {{United States}} in {{Near Real}}-{{Time}}},
+  volume = {10},
+  issn = {1553-7358},
+  doi = {10.1371/journal.pcbi.1003581},
+  number = {4},
+  urldate = {2017-07-26},
+  journal = {PLOS Computational Biology},
+  author = {McIver, David J. and Brownstein, John S.},
+  month = apr,
+  year = {2014},
+  keywords = {H1N1,Swine influenza,Online encyclopedias,United States,Infectious disease control,Infectious disease surveillance,Influenza,Influenza viruses},
+  pages = {e1003581}
+}
+
+@inproceedings{steinmacher_social_2015,
+  title = {Social {{Barriers Faced}} by {{Newcomers Placing Their First Contribution}} in {{Open Source Software Projects}}},
+  isbn = {978-1-4503-2922-4},
+  doi = {10.1145/2675133.2675215},
+  language = {en},
+  urldate = {2017-07-27},
+  booktitle = {Proceedings of the 18th {{ACM Conference}} on {{Computer Supported Cooperative Work}} \& {{Social Computing}}},
+  publisher = {{ACM Press}},
+  author = {Steinmacher, Igor and Conte, Tayana and Gerosa, Marco Aur{\'e}lio and Redmiles, David},
+  year = {2015},
+  pages = {1379--1392}
+}
+
+@inproceedings{lampe_follow_2005,
+  title = {Follow the (Slash) Dot: Effects of Feedback on New Members in an Online Community},
+  shorttitle = {Follow the (Slash) Dot},
+  urldate = {2017-07-27},
+  booktitle = {Proceedings of the 2005 International {{ACM SIGGROUP}} Conference on {{Supporting}} Group Work},
+  publisher = {{ACM}},
+  author = {Lampe, Cliff and Johnston, Erik},
+  year = {2005},
+  pages = {11--20}
+}
+
+@article{ji_influence_2010,
+  title = {The {{Influence}} of {{Cultural Differences}} on the {{Use}} of {{Social Network Services}} and the {{Formation}} of {{Social Capital}}},
+  volume = {26},
+  issn = {1044-7318, 1532-7590},
+  doi = {10.1080/10447318.2010.516727},
+  language = {en},
+  number = {11-12},
+  urldate = {2017-07-27},
+  journal = {International Journal of Human-Computer Interaction},
+  author = {Ji, Yong Gu and Hwangbo, Hwan and Yi, Ji Soo and Rau, P. L. Patrick and Fang, Xiaowen and Ling, Chen},
+  month = nov,
+  year = {2010},
+  pages = {1100--1121}
+}
+
+@article{agrawal_simple_2014,
+  title = {Some Simple Economics of Crowdfunding},
+  volume = {14},
+  number = {1},
+  urldate = {2017-07-27},
+  journal = {Innovation Policy and the Economy},
+  author = {Agrawal, Ajay and Catalini, Christian and Goldfarb, Avi},
+  year = {2014},
+  pages = {63--97}
+}
+
+@article{gorbatai_paradox_2014,
+  title = {The Paradox of Novice Contributions to Collective Production: {{Evidence}} from {{Wikipedia}}},
+  shorttitle = {The Paradox of Novice Contributions to Collective Production},
+  urldate = {2017-07-27},
+  author = {Gorbatai, Andreea D.},
+  year = {2014}
+}
+
+@article{gorbatai_social_2012,
+  title = {Social Structure of Contributions to Wikipedia},
+  urldate = {2017-07-27},
+  journal = {Harvard Business School Working Paper},
+  author = {Gorbatai, Andreea Daniela and Piskorski, M.},
+  year = {2012}
+}
+
+@article{joyce_predicting_2006,
+  title = {Predicting {{Continued Participation}} in {{Newsgroups}}},
+  volume = {11},
+  issn = {1083-6101},
+  doi = {10.1111/j.1083-6101.2006.00033.x},
+  language = {en},
+  number = {3},
+  urldate = {2017-08-02},
+  journal = {Journal of Computer-Mediated Communication},
+  author = {Joyce, Elisabeth and Kraut, Robert E.},
+  month = apr,
+  year = {2006},
+  pages = {723--747}
+}
+
+@inproceedings{burke_feed_2009,
+  address = {New York, NY, USA},
+  series = {CHI '09},
+  title = {Feed {{Me}}: {{Motivating Newcomer Contribution}} in {{Social Network Sites}}},
+  isbn = {978-1-60558-246-7},
+  shorttitle = {Feed {{Me}}},
+  doi = {10.1145/1518701.1518847},
+  urldate = {2017-08-02},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Burke, Moira and Marlow, Cameron and Lento, Thomas},
+  year = {2009},
+  keywords = {distribution,feedback,social learning,sns,social network sites,motivating contribution,production incentives,sharing,singling out,Online Communities},
+  pages = {945--954}
+}
+
+@inproceedings{lin_better_2017,
+  title = {Better When It Was Smaller? {{Community}} Content and Behavior after Massive Growth},
+  shorttitle = {Better {{When It Was Smaller}}?},
+  urldate = {2017-08-02},
+  booktitle = {{{ICWSM}}},
+  author = {Lin, Zhiyuan and Salehi, Niloufar and Yao, Bowen and Chen, Yiqi and Bernstein, Michael S.},
+  year = {2017},
+  pages = {132--141}
+}
+
+@inproceedings{wilson_replichi_2011,
+  address = {New York, New York},
+  series = {CHI EA '11},
+  title = {{{RepliCHI}} - {{CHI Should Be Replicating}} and {{Validating Results More}}: {{Discuss}}},
+  isbn = {978-1-4503-0268-5},
+  shorttitle = {{{RepliCHI}} - {{CHI Should Be Replicating}} and {{Validating Results More}}},
+  doi = {10.1145/1979742.1979491},
+  urldate = {2017-07-25},
+  booktitle = {{{CHI}} '11 {{Extended Abstracts}} on {{Human Factors}} in {{Computing Systems}}},
+  publisher = {{ACM}},
+  author = {Wilson, Max L. and Mackay, Wendy and Chi, Ed and Bernstein, Michael and Russell, Dan and Thimbleby, Harold},
+  year = {2011},
+  keywords = {hci,replication,research,science},
+  pages = {463--466}
+}
+
+@article{velden_decentering_2013,
+  title = {Decentering {{Design}}: {{Wikipedia}} and {{Indigenous Knowledge}}},
+  volume = {29},
+  issn = {1044-7318},
+  shorttitle = {Decentering {{Design}}},
+  doi = {10.1080/10447318.2013.765768},
+  number = {4},
+  urldate = {2017-08-03},
+  journal = {International Journal of Human\textendash{}Computer Interaction},
+  author = {van der Velden, Maja},
+  month = mar,
+  year = {2013},
+  pages = {308--316}
+}
+
+@article{_talk:elinor_2017,
+  title = {Talk:{{Elinor Ostrom}}},
+  copyright = {Creative Commons Attribution-ShareAlike License},
+  shorttitle = {Talk},
+  language = {en},
+  urldate = {2017-08-19},
+  journal = {Wikipedia},
+  month = aug,
+  year = {2017}
+}
+
+@article{keegan_evolution_2017,
+  title = {The {{Evolution}} and {{Consequences}} of {{Peer Producing Wikipedia}}'s {{Rules}}},
+  urldate = {2017-08-24},
+  author = {Keegan, Brian and Fiesler, Casey},
+  year = {2017}
+}
+
+@inproceedings{roth_measuring_2008,
+  address = {New York, New York},
+  series = {WikiSym '08},
+  title = {Measuring Wiki Viability: {{An}} Empirical Assessment of the Social Dynamics of a Large Sample of Wikis},
+  isbn = {978-1-60558-128-6},
+  shorttitle = {Measuring {{Wiki Viability}}},
+  doi = {10.1145/1822258.1822294},
+  urldate = {2017-04-16},
+  booktitle = {Proceedings of the 4th {{International Symposium}} on {{Wikis}}},
+  publisher = {{ACM}},
+  author = {Roth, Camille and Taraborelli, Dario and Gilbert, Nigel},
+  year = {2008},
+  keywords = {dynamics,ecology,governance,metrics,moderation,Online Communities,viability,Web 2.0,wiki demography,wikis},
+  pages = {27:1--27:5}
+}
+
+@article{piskorski_testing_2017,
+  title = {Testing {{Coleman}}'s Social-Norm Enforcement Mechanism: {{Evidence}} from {{Wikipedia}}},
+  volume = {122},
+  issn = {0002-9602},
+  shorttitle = {Testing {{Coleman}}'s {{Social}}-{{Norm Enforcement Mechanism}}},
+  doi = {10.1086/689816},
+  number = {4},
+  urldate = {2017-09-05},
+  journal = {American Journal of Sociology},
+  author = {Piskorski, Miko{\l}aj Jan and Gorbat{\^a}i, Andreea},
+  year = {2017},
+  pages = {1183--1222}
+}
+
+@article{zhang_group_2011,
+  title = {Group Size and Incentives to Contribute: {{A}} Natural Experiment at Chinese {{Wikipedia}}},
+  volume = {101},
+  issn = {0002-8282},
+  shorttitle = {Group {{Size}} and {{Incentives}} to {{Contribute}}},
+  doi = {10.2307/23045913},
+  number = {4},
+  urldate = {2017-09-05},
+  journal = {The American Economic Review},
+  author = {Zhang, Xiaoquan and Zhu, Feng},
+  year = {2011},
+  pages = {1601--1615}
+}
+
+@inproceedings{barbosa_averaging_2016,
+  address = {Republic and Canton of Geneva, Switzerland},
+  title = {Averaging Gone Wrong: {{Using}} Time-Aware Analyses to Better Understand Behavior},
+  isbn = {978-1-4503-4143-1},
+  shorttitle = {Averaging {{Gone Wrong}}},
+  doi = {10.1145/2872427.2883083},
+  urldate = {2017-09-09},
+  booktitle = {Proceedings of the 25th {{International Conference}} on {{World Wide Web}} ({{WWW}} '16)},
+  publisher = {{International World Wide Web Conferences Steering Committee}},
+  author = {Barbosa, Samuel and Cosley, Dan and Sharma, Amit and Cesar, Jr., Roberto M.},
+  year = {2016},
+  keywords = {reddit,computational social science,cohorts,temporal,user behavior},
+  pages = {829--841}
+}
+
+@incollection{benkler_peer_2015,
+  address = {Cambridge, Massachusetts},
+  title = {Peer Production: {{A}} Form of Collective Intelligence},
+  isbn = {978-0-262-02981-0},
+  language = {en},
+  booktitle = {Handbook of {{Collective Intelligence}}},
+  publisher = {{MIT Press}},
+  author = {Benkler, Yochai and Shaw, Aaron and Hill, Benjamin Mako},
+  editor = {Malone, Thomas W. and Bernstein, Michael S.},
+  year = {2015},
+  keywords = {Business \& Economics / General,Science / Cognitive Science},
+  pages = {175--204}
+}
+
+@inproceedings{hornbaek_is_2014,
+  address = {New York, New York},
+  title = {Is Once Enough?: {{On}} the Extent and Content of Replications in Human-Computer Interaction},
+  isbn = {978-1-4503-2473-1},
+  shorttitle = {Is {{Once Enough}}?},
+  doi = {10.1145/2556288.2557004},
+  urldate = {2017-11-20},
+  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}} ({{CHI}} '14)},
+  publisher = {{ACM}},
+  author = {Hornb{\ae}k, Kasper and Sander, S{\o}ren S. and Bargas-Avila, Javier Andr{\'e}s and Grue Simonsen, Jakob},
+  year = {2014},
+  keywords = {replications},
+  pages = {3523--3532}
+}
+
+@article{freeman_tyranny_1972,
+  title = {The Tyranny of Structurelessness},
+  volume = {17},
+  copyright = {Copyright \textcopyright{} 1972 Regents of the University of California},
+  issn = {0067-5830},
+  urldate = {2015-06-19},
+  journal = {Berkeley Journal of Sociology},
+  author = {Freeman, Jo},
+  month = jan,
+  year = {1972},
+  pages = {151--164}
+}
+
+@article{halfaker_rise_2013,
+  title = {The Rise and Decline of an Open Collaboration System: How {{Wikipedia}}'s Reaction to Popularity Is Causing Its Decline},
+  volume = {57},
+  issn = {0002-7642},
+  shorttitle = {The {{Rise}} and {{Decline}} of an {{Open Collaboration System}}},
+  doi = {10.1177/0002764212469365},
+  language = {en},
+  number = {5},
+  journal = {American Behavioral Scientist},
+  author = {Halfaker, Aaron and Geiger, R. Stuart and Morgan, Jonathan T. and Riedl, John},
+  month = may,
+  year = {2013},
+  pages = {664--688}
+}
+
+
diff --git a/paper_source/sigchi.cls b/paper_source/sigchi.cls
new file mode 100644 (file)
index 0000000..3ee414a
--- /dev/null
@@ -0,0 +1,1676 @@
+% CHI Proceedings Template.
+%
+% FILENAME: sigchi.cls
+%  
+% SOURCE: See https://github.com/sigchi for latest official version.
+%
+% NOTE: If you wish to edit this document, please list edits properly
+% in the CHANGELOG section and not scattered through the code.
+% 
+% CHANGELOG: 
+%
+% 2015-12-10 Daniel Ashbrook Switch 7-bit fonts to 8-bit
+%
+% 2015-03-21 David A. Shamma Updating for new format and Github
+% repo for CHI 2016.
+%
+% 2014-07-30 Scooter Morris Updated to add DOI text to copyright
+% footer
+%
+% 2011-10-19 DanCo Added \sloppy to reduce overfull lines per Sheridan
+%
+% 2011-08-28 Jean-Daniel Fekete Re-added flushleft in \terms &
+% \category, added compatibility for the caption package
+%
+% 2006 Andrew Duchowski Updated the metafont names to use more
+% modern Berry font naming schemes.
+%
+% 2002-07-30 JS/GM Fix to vertical spacing before Proofs
+%
+% 2002-07-29 Georgia Fixed bug in sub-sub-section numbering in
+% paragraphs
+%
+% 1999-09-11 Gerry Murray Allowance Made to switch default fonts
+% between those systems using METAFONT and those using 'Type 1' or
+% 'Truetype' fonts.  See LINE NUMBER 266 for details.  Also provided
+% for enumerated/annotated Corollaries 'surrounded' by enumerated
+% Theorems (line 838).
+%
+% Modified from ACM_PROC_ARTICLE-SP DOCUMENT STYLE by G.K.M. Tobin
+% August-October 1999.  Adapted from ARTICLE document style by Ken
+% Traub, Olin Shivers also using elements of esub2acm.cls.  Modified
+% from ARTICLE DOCUMENT STYLE -- Released 16 March 1988 for LaTeX
+% version 2.09 by Leslie Lamport, 16 March 1988.
+% 
+% MISC: 
+%
+% Fixed the format to match the Word template and added a \teaser
+% command to add a teaser image.
+%
+% Made the Permission Statement / Conference Info / Copyright Info
+% 'user definable' in the source .tex file OR automatic if not
+% specified.  This 'sp' version does NOT produce the permission block.
+%
+% Major change in January 2000 was to include a "blank line" in
+% between new paragraphs. This involved major changes to the, then,
+% acmproc-sp.cls 1.0SP file, precipitating a 'new' name:
+% "acm_proc_article-sp.cls" V2.01SP.  Footnotes inside table cells
+% using \minipage (Oct. 2002)
+%
+% LICENSE: Public domain: You are free to do whatever you want with
+% this template.  If you improve this in any way, please submit a
+% pull request to the Github repository.
+%
+% ---- Start of example  ----
+%
+% \documentclass{sigchi}
+% \usepackage{times}
+% \pagenumbering{arabic} % Arabic page numbers for submission.  
+%                        % Remove this line to eliminate page numbers  
+%                        % for the camera ready copy
+% \title{The Title}
+% \numberofauthors{2}
+% \author{
+%   \alignauthor Author 1
+%     \affaddr{Affiliation}\\
+%     \affaddr{Affiliation}\\
+%     \email{author@a.com}
+%   \alignauthor Author 2
+%     \affaddr{Affiliation}\\
+%     \affaddr{Affiliation}\\
+%     \email{author2@b.com}
+% }
+%
+%\begin{document}
+%\maketitle
+%\abstract{This is the abstract}
+%\keywords{Put author keywords here}
+%\classification{The ACM Classification keywords here.}
+%
+%\section{Introduction}
+% ...
+%\end{document}
+%
+% ---- End of example  ----
+%
+% For the ACM Tracking Purposes
+\def\fileversion{V0.20}
+\def\filedate{March 21, 2015}
+\def\docdate {\filedate}
+% Packages
+\usepackage{epsfig}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+%%% ACM_PROC_ARTICLE-SP is a document style for producing two-column camera-ready pages for
+%%% ACM conferences, according to ACM specifications.  The main features of
+%%% this style are:
+%%%
+%%% 1)  Two columns.
+%%% 2)  Side and top margins of 4.5pc, bottom margin of 6pc, column gutter of
+%%%     2pc, hence columns are 20pc wide and 55.5pc tall.  (6pc = 1in, approx)
+%%% 3)  First page has title information, and an extra 6pc of space at the
+%%%     bottom of the first column for the ACM copyright notice.
+%%% 4)  Text is 9pt on 10pt baselines; titles (except main) are 9pt bold.
+%%%
+%%%
+%%% There are a few restrictions you must observe:
+%%%
+%%% 1)  You cannot change the font size; ACM wants you to use 9pt.
+%%% 2)  You must start your paper with the \maketitle command.  Prior to the
+%%%     \maketitle you must have \title and \author commands.  If you have a
+%%%     \date command it will be ignored; no date appears on the paper, since
+%%%     the proceedings will have a date on the front cover.
+%%% 3)  Marginal paragraphs, tables of contents, lists of figures and tables,
+%%%     and page headings are all forbidden.
+%%% 4)  The `figure' environment will produce a figure one column wide; if you
+%%%     want one that is two columns wide, use `figure*'.
+%%%
+%
+%%% Copyright Space:
+%%% This style automatically leaves 1" blank space at the bottom of page 1/
+%%% column 1.  This space can optionally be filled with some text using the
+%%% \toappear{...} command.  If used, this command must be BEFORE the \maketitle
+%%% command.  If this command is defined AND [preprint] is on, then the
+%%% space is filled with the {...} text (at the bottom); otherwise, it is
+%%% blank.  If you use \toappearbox{...} instead of \toappear{...} then a
+%%% box will be drawn around the text (if [preprint] is on).
+%%%
+%%% A typical usage looks like this:
+%%%     \toappear{To appear in the Ninth AES Conference on Medieval Lithuanian
+%%%               Embalming Technique, June 1991, Alfaretta, Georgia.}
+%%% This will be included in the preprint, and left out of the conference
+%%% version.
+%%%
+%%% WARNING:
+%%% Some dvi-ps converters heuristically allow chars to drift from their
+%%% true positions a few pixels. This may be noticeable with the 9pt sans-serif
+%%% bold font used for section headers.
+%%% You may turn this hackery off via the -e option:
+%%%     dvips -e 0 foo.dvi >foo.ps
+%%%
+
+\typeout{}
+
+\typeout{Document Class SIGCHI}
+
+\typeout{Available at https://github.com/sigchi.}
+
+\typeout{}
+
+\typeout{- <May 11, 2016> Updated by Florian Heller to accept ACM copyright commands.}
+
+\typeout{- <March 21, 2015> Updated by David A. Shamma for CHI 2016.}
+
+\typeout{- <July 30, 2014> Updated for 2014 by William Hudson and
+  Jean-Daniel Fekete.}
+
+\typeout{- <October 19, 2010> Updated for 2011 by Manas Tungare.}
+
+\typeout{Shamelessly copied from}
+
+\typeout{- <September 7, 2000> Modified by Jan O. Borchers from
+  `chiproceedings'}
+
+\typeout{- <October 2, 2002> Modified by G.K.M. Tobin from
+  `acm_proc_article-sp'}
+
+\typeout{- <May 22, 1989> Hacked from `acmconf' 4/91 by
+  shivers@cs.cmu.edu, 4/93 by theobald@cs.mcgill.ca}
+
+\typeout{Excerpts were taken from (Journal Style) 'esub2acm.cls'.}
+
+\typeout{}
+
+% Options taken from ACM LaTeX template. FH, May 11 2016
+% New option code by BV
+
+\newcount\ACM@basesize
+\ACM@basesize=9\relax
+\DeclareOption{9pt}{\ACM@basesize=9\relax}
+\DeclareOption{10pt}{\ACM@basesize=10\relax}
+\DeclareOption{11pt}{\ClassError{sigchi}{The `11pt' option is
+    not allowed}{sigchi exists in 9pt and 10pt versions only}}
+\DeclareOption{12pt}{\ClassError{sigchi}{The `12pt' option is
+    not allowed}{sigchi exists in 9pt and 10pt versions only}}
+
+\ExecuteOptions{9pt}
+\ProcessOptions
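+
+% Usage sketch (not part of the original class): given the option
+% declarations above, a document selects the base size when loading the
+% class, e.g.
+%   \documentclass[10pt]{sigchi}  % 9pt is the default via \ExecuteOptions
+% while [11pt] or [12pt] trigger the \ClassError declared above.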
+
+\def\doi#1{\def\@doi{#1}}
+\doi{http://dx.doi.org/10.1145/0000000.0000000}
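+
+% Sketch of the intended use (inferred from the definition above): a paper
+% overrides the placeholder in its preamble with its assigned DOI, e.g.
+%   \doi{http://dx.doi.org/10.1145/3025453.3025639}
+% (the DOI shown is borrowed from the bibliography above, purely as an
+% illustration).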
+
+\let\@concepts\@empty
+
+
+\oddsidemargin 1.9025cm            % [jdf] stick to CHI Formatting Guidelines
+\evensidemargin 1.9025cm           % [jdf] idem
+\advance\oddsidemargin by -1in  % Correct for LaTeX gratuitousness
+\advance\evensidemargin by -1in % Correct for LaTeX gratuitousness
+\marginparwidth 0pt             % Margin pars are not allowed.
+\marginparsep 11pt              % Horizontal space between outer margin and
+                                % marginal note
+
+                                % Top of page:
+\topmargin 2.2cm                % [jdf] stick to CHI Formatting Guidelines
+%\topmargin 6.5pc                % Nominal distance from top of page to top of  % 00-09-07 job (for A4)
+                                % box containing running head.
+\advance\topmargin by -1in      % Correct for LaTeX gratuitousness
+\headheight 0pt                 % Height of box containing running head.
+\headsep 0pt                    % Space between running head and text.
+                                % Bottom of page:
+\footskip 30pt                  % Distance from baseline of box containing foot
+                                % to baseline of last line of text.
+\@ifundefined{footheight}{\newdimen\footheight}{}% this is for LaTeX2e
+\footheight 12pt                % Height of box containing running foot.
+
+
+%% Must redefine the top margin so there's room for headers and
+%% page numbers if you are using the preprint option. Footers
+%% are OK as is. Olin.
+\advance\topmargin by -37pt     % Leave 37pt above text for headers
+\headheight 12pt                % Height of box containing running head.
+\headsep 25pt                   % Space between running head and text.
+
+\textheight 23cm        % [jdf] stick to CHI Formatting Guidelines
+\textwidth 18cm         % [jdf] stick to CHI Formatting Guidelines
+                        % For two-column mode:
+\columnsep 8.5mm        % [jdf] stick to CHI Formatting Guidelines
+\columnseprule 0pt      %    Width of rule between columns.
+\hfuzz 1pt              % Allow some variation in column width, otherwise it's
+                        % too hard to typeset in narrow columns.
+
+\footnotesep 5.6pt      % Height of strut placed at the beginning of every
+                        % footnote = height of normal \footnotesize strut,
+                        % so no extra space between footnotes.
+
+\skip\footins 8.1pt plus 4pt minus 2pt  % Space between last line of text and
+                                        % top of first footnote.
+\floatsep 11pt plus 2pt minus 2pt       % Space between adjacent floats moved
+                                        % to top or bottom of text page.
+\textfloatsep 18pt plus 2pt minus 4pt   % Space between main text and floats
+                                        % at top or bottom of page.
+\intextsep 11pt plus 2pt minus 2pt      % Space between in-text figures and
+                                        % text.
+\@ifundefined{@maxsep}{\newdimen\@maxsep}{}% this is for LaTeX2e
+\@maxsep 18pt                           % The maximum of \floatsep,
+                                        % \textfloatsep and \intextsep (minus
+                                        % the stretch and shrink).
+\dblfloatsep 11pt plus 2pt minus 2pt    % Same as \floatsep for double-column
+                                        % figures in two-column mode.
+\dbltextfloatsep 18pt plus 2pt minus 4pt% \textfloatsep for double-column
+                                        % floats.
+\@ifundefined{@dblmaxsep}{\newdimen\@dblmaxsep}{}% this is for LaTeX2e
+\@dblmaxsep 18pt                        % The maximum of \dblfloatsep and
+                                        % \dbltextfloatsep.
+\@fptop 0pt plus 1fil    % Stretch at top of float page/column. (Must be
+                         % 0pt plus ...)
+\@fpsep 8pt plus 2fil    % Space between floats on float page/column.
+\@fpbot 0pt plus 1fil    % Stretch at bottom of float page/column. (Must be
+                         % 0pt plus ... )
+\@dblfptop 0pt plus 1fil % Stretch at top of float page. (Must be 0pt plus ...)
+\@dblfpsep 8pt plus 2fil % Space between floats on float page.
+\@dblfpbot 0pt plus 1fil % Stretch at bottom of float page. (Must be
+                         % 0pt plus ... )
+\marginparpush 5pt       % Minimum vertical separation between two marginal
+                         % notes.
+
+\parskip 0pt                % Extra vertical space between paragraphs.
+                    % Set to 0pt outside sections, to keep section heads
+                    % uniformly spaced.  The value of parskip is set
+                    % to leading value _within_ sections.
+                    % 12 Jan 2000 gkmt
+\parindent 0pt                % Width of paragraph indentation.
+%\partopsep 2pt plus 1pt minus 1pt% Extra vertical space, in addition to
+                                 % \parskip and \topsep, added when user
+                                 % leaves blank line before environment.
+\partopsep 0pt                   % 00-09-07 job
+
+\doublehyphendemerits=9999        % No consecutive line hyphens
+\brokenpenalty=9999               % No broken words across pages
+\widowpenalty=9999                % Almost no widows at bottom of page
+\clubpenalty=9999                 % Almost no orphans at top of page
+\interfootnotelinepenalty=9999    % Almost never break footnotes
+
+\@lowpenalty   51       % Produced by \nopagebreak[1] or \nolinebreak[1]
+\@medpenalty  151       % Produced by \nopagebreak[2] or \nolinebreak[2]
+\@highpenalty 301       % Produced by \nopagebreak[3] or \nolinebreak[3]
+
+\@beginparpenalty -\@lowpenalty % Before a list or paragraph environment.
+\@endparpenalty   -\@lowpenalty % After a list or paragraph environment.
+\@itempenalty     -\@lowpenalty % Between list items.
+
+\@namedef{ds@10pt}{\@latexerr{The `10pt' option is not allowed in the `acmconf'
+  document style.}\@eha}
+\@namedef{ds@11pt}{\@latexerr{The `11pt' option is not allowed in the `acmconf'
+  document style.}\@eha}
+\@namedef{ds@12pt}{\@latexerr{The `12pt' option is not allowed in the `acmconf'
+  document style.}\@eha}
+
+\@options
+
+\lineskip 2pt           % \lineskip is 1pt for all font sizes.
+\normallineskip 2pt
+\def\baselinestretch{1}
+
+\abovedisplayskip 9pt plus2pt minus4.5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip  \z@ plus3pt%
+\belowdisplayshortskip  5.4pt plus3pt minus3pt%
+\let\@listi\@listI     % Setting of \@listi added 9 Jun 87
+
+\def\small{\@setsize\small{9pt}\viiipt\@viiipt
+\abovedisplayskip 7.6pt plus 3pt minus 4pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip \z@ plus2pt%
+\belowdisplayshortskip 3.6pt plus2pt minus 2pt
+\def\@listi{\leftmargin\leftmargini %% Added 22 Dec 87
+\topsep 4pt plus 2pt minus 2pt\parsep 2pt plus 1pt minus 1pt
+\itemsep \parsep}}
+
+\def\footnotesize{\@setsize\footnotesize{9pt}\ixpt\@ixpt
+\abovedisplayskip 6.4pt plus 2pt minus 4pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip \z@ plus 1pt%
+\belowdisplayshortskip 2.7pt plus 1pt minus 2pt
+\def\@listi{\leftmargin\leftmargini %% Added 22 Dec 87
+\topsep 3pt plus 1pt minus 1pt\parsep 2pt plus 1pt minus 1pt
+\itemsep \parsep}}
+
+\newcount\aucount
+\newcount\originalaucount
+\newdimen\auwidth
+\auwidth=\textwidth
+\newdimen\auskip
+\newcount\auskipcount
+\newdimen\auskip
+\global\auskip=1pc
+\newdimen\allauboxes
+\allauboxes=\auwidth
+\newtoks\addauthors
+
+\newtoks\subtitletext
+\gdef\subtitle#1{\subtitletext={#1}}
+
+\gdef\additionalauthors#1{\addauthors={#1}}
+
+\gdef\numberofauthors#1{\global\aucount=#1
+\ifnum\aucount>3\global\originalaucount=\aucount \global\aucount=3\fi%
+\global\auskipcount=\aucount\global\advance\auskipcount by -1 % [jdf] fix
+\global\multiply\auskipcount by 2
+\global\multiply\auskip by \auskipcount
+\global\advance\auwidth by -\auskip
+\global\divide\auwidth by \aucount
+\global\advance\auwidth by -5pt} % [jdf] tabular add some horizontal space
+
+% \and was modified to count the number of authors.  GKMT 12 Aug 1999
+\def\alignauthor{%                  % \begin{tabular}
+  \end{tabular} %
+  \hskip 1pt % [jdf] allow break for more than 3 authors
+  \begin{tabular}[t]{p{\auwidth}}%
+    \centering\baselineskip 13pt}% [jdf] added more vertical spacing
+
+%  *** NOTE *** NOTE *** NOTE *** NOTE ***
+%  If you have 'font problems' then you may need
+%  to change these, e.g. 'arialb' instead of "arialbd".
+%  Gerry Murray 11/11/1999
+%  *** OR ** comment out block A and activate block B or vice versa.
+% **********************************************
+%
+%  -- Start of block A -- (Type 1 or Truetype fonts)
+%\newfont{\secfnt}{timesbd at 12pt} % was timenrb originally - now is timesbd
+%\newfont{\secit}{timesbi at 12pt}   %13 Jan 00 gkmt
+%\newfont{\subsecfnt}{timesi at 11pt} % was timenrri originally - now is timesi
+%\newfont{\subsecit}{timesbi at 11pt} % 13 Jan 00 gkmt -- was times changed to timesbi gm 2/4/2000
+%                         % because "normal" is italic, "italic" is Roman
+%\newfont{\ttlfnt}{arialbd at 18pt} % was arialb originally - now is arialbd
+%\newfont{\ttlit}{arialbi at 18pt}    % 13 Jan 00 gkmt
+%\newfont{\subttlfnt}{arial at 14pt} % was arialr originally - now is arial
+%\newfont{\subttlit}{ariali at 14pt} % 13 Jan 00 gkmt
+%\newfont{\subttlbf}{arialbd at 14pt}  % 13 Jan 00 gkmt
+%\newfont{\aufnt}{arial at 12pt} % was arialr originally - now is arial
+%\newfont{\auit}{ariali at 12pt} % 13 Jan 00 gkmt
+%\newfont{\affaddr}{arial at 10pt} % was arialr originally - now is arial
+%\newfont{\affaddrit}{ariali at 10pt} %13 Jan 00 gkmt
+%\newfont{\eaddfnt}{arial at 12pt} % was arialr originally - now is arial
+%\newfont{\ixpt}{times at 9pt} % was timenrr originally - now is times
+%\newfont{\confname}{timesi at 8pt} % was timenrri - now is timesi
+%\newfont{\crnotice}{times at 8pt} % was timenrr originally - now is times
+%\newfont{\ninept}{times at 9pt} % was timenrr originally - now is times
+
+% *********************************************
+%  -- End of block A --
+%
+%
+% -- Start of block B -- METAFONT
+% +++++++++++++++++++++++++++++++++++++++++++++
+% Next (default) block for those using Metafont
+% Gerry Murray 11/11/1999
+% *** THIS BLOCK FOR THOSE USING METAFONT *****
+% *********************************************
+%%\newfont{\secfnt}{ptmb at 12pt}
+%\newfont{\secfnt}{phvb at 9pt}                    % 00-09-07 job
+%%\newfont{\secit}{ptmbi at 12pt}    %13 Jan 00 gkmt
+%\newfont{\secit}{phvbo at 9pt}    %13 Jan 00 gkmt % 00-09-07 job
+%%\newfont{\subsecfnt}{ptmri at 11pt}
+%\newfont{\subsecfnt}{phvro at 9pt}                % 00-09-07 job
+%%\newfont{\subsecit}{ptmbi at 11pt}  % 13 Jan 00 gkmt -- was ptmr changed to ptmbi gm 2/4/2000
+%\newfont{\subsecit}{phvr at 9pt}  % 13 Jan 00 gkmt -- was ptmr changed to ptmbi gm 2/4/2000     % 00-09-07 job
+%                         % because "normal" is italic, "italic" is Roman
+%\newfont{\ttlfnt}{phvb at 18pt}
+%\newfont{\ttlit}{phvbo at 18pt}    % GM 2/4/2000
+%\newfont{\subttlfnt}{phvr at 14pt}
+%\newfont{\subttlit}{phvro at 14pt} % GM 2/4/2000
+%\newfont{\subttlbf}{phvb at 14pt}  % 13 Jan 00 gkmt
+%%\newfont{\aufnt}{phvr at 12pt}
+%\newfont{\aufnt}{ptmb at 12pt}                    % 00-09-07 job
+%%\newfont{\auit}{phvro at 12pt}     % GM 2/4/2000
+%\newfont{\auit}{ptmbo at 12pt}    % GM 2/4/2000   % 00-09-07 job
+%%\newfont{\affaddr}{phvr at 10pt}
+%\newfont{\affaddr}{ptmr at 12pt}                  % 00-09-07 job
+%%\newfont{\affaddrit}{phvro at 10pt} % GM 2/4/2000
+%\newfont{\affaddrit}{ptmro at 12pt} % GM 2/4/2000 % 00-09-07 job
+%%\newfont{\eaddfnt}{phvr at 12pt}
+%\newfont{\eaddfnt}{ptmr at 12pt}                  % 00-09-07 job
+%%\newfont{\ixpt}{ptmr at 9pt}
+%\newfont{\ixpt}{ptmr at 10pt}          % 00-09-07 job
+%\newfont{\confname}{ptmri at 8pt}
+%\newfont{\crnotice}{ptmr at 8pt}
+%%\newfont{\ninept}{ptmr at 9pt}
+%\newfont{\ninept}{ptmr at 10pt}        % 00-09-07 job
+% *********************************************
+%  -- End of block B --
+%
+%
+% -- Start of block C -- METAFONT (modern usage)
+% +++++++++++++++++++++++++++++++++++++++++++++
+% Next (default) block for those using Metafont
+% Andrew Duchowski 06/19/2006
+% *** THIS BLOCK FOR THOSE USING METAFONT *****
+% *********************************************
+% notes: 7t fonts use the 7-bit OT1 LaTeX encoding, 8t fonts the 8-bit T1 encoding
+\newfont{\secfnt}{phvb8t at 9pt}
+\newfont{\secit}{phvbo8t at 9pt}
+\newfont{\subsecfnt}{phvro8t at 9pt}
+\newfont{\subsecit}{phvr8t at 9pt}
+\newfont{\ttlfnt}{phvb8t at 18pt}
+\newfont{\ttlit}{phvbo8t at 18pt}
+\newfont{\subttlfnt}{phvr8t at 14pt}
+\newfont{\subttlit}{phvro8t at 14pt}
+\newfont{\subttlbf}{phvb8t at 14pt}
+\newfont{\aufnt}{ptmb8t at 12pt}
+\newfont{\auit}{ptmbo8t at 12pt}
+\newfont{\affaddr}{ptmr8t at 12pt}
+\newfont{\affaddrit}{ptmro8t at 12pt}
+\newfont{\eaddfnt}{ptmr8t at 12pt}
+\newfont{\ixpt}{ptmr8t at 10pt}
+\newfont{\confname}{ptmri8t at 8pt}
+\newfont{\crnotice}{ptmr8t at 8pt}
+\newfont{\ninept}{ptmr8t at 10pt}
+% +++++++++++++++++++++++++++++++++++++++++++++
+% -- End of block C --
+
+%\def\email#1{{{\eaddfnt{\vskip 4pt#1}}}}
+\def\email#1{{{\eaddfnt{#1}}}} % 00-09-07 job
+
+\def\addauthorsection{\ifnum\originalaucount>3
+    \section{Additional Authors}\the\addauthors
+  \fi}
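+
+% Usage sketch (inferred from \numberofauthors and \addauthorsection
+% above, not prescribed by the class): with more than three authors, the
+% first three go in \author via \alignauthor and the rest are handed to
+% \additionalauthors; \addauthorsection then emits them in the body, e.g.
+%   \numberofauthors{5}
+%   \additionalauthors{Author 4 (Affiliation) and Author 5 (Affiliation)}
+%   ...
+%   \addauthorsection  % prints "Additional Authors" when originalaucount > 3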
+
+\newcount\savesection
+\newcount\sectioncntr
+\global\sectioncntr=1
+
+\setcounter{secnumdepth}{0}
+
+\def\appendix{\par
+\section*{APPENDIX}
+\setcounter{section}{0}
+ \setcounter{subsection}{0}
+ \def\thesection{\Alph{section}} }
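+
+% Sketch (follows from the definition above): \appendix prints an
+% unnumbered APPENDIX heading, resets the section counters, and switches
+% \thesection to letters for anything that still formats section numbers,
+% e.g. in the document body:
+%   \appendix
+%   \section{Implementation Details}
+% (headings themselves stay unnumbered because secnumdepth is 0 above).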
+
+
+%\leftmargini 22.5pt
+\leftmargini 10pt % DLC
+\leftmarginii 19.8pt    % > \labelsep + width of '(m)'
+\leftmarginiii 16.8pt   % > \labelsep + width of 'vii.'
+\leftmarginiv 15.3pt    % > \labelsep + width of 'M.'
+\leftmarginv 9pt
+\leftmarginvi 9pt
+
+\leftmargin\leftmargini
+\labelsep 4.5pt
+\labelwidth\leftmargini\advance\labelwidth-\labelsep
+
+\def\@listI{\leftmargin\leftmargini \parsep 3.6pt plus 2pt minus 1pt%
+%\topsep 7.2pt plus 2pt minus 4pt%
+\topsep 0pt%                        % 00-09-07 job
+\itemsep 3.6pt plus 2pt minus 1pt}
+
+\let\@listi\@listI
+\@listi
+
+\def\@listii{\leftmargin\leftmarginii
+   \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+   \topsep 3.6pt plus 2pt minus 1pt
+   \parsep 1.8pt plus 0.9pt minus 0.9pt
+   \itemsep \parsep}
+
+\def\@listiii{\leftmargin\leftmarginiii
+    \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+    \topsep 1.8pt plus 0.9pt minus 0.9pt
+    \parsep \z@ \partopsep 1pt plus 0pt minus 1pt
+    \itemsep \topsep}
+
+\def\@listiv{\leftmargin\leftmarginiv
+     \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+
+\def\@listv{\leftmargin\leftmarginv
+     \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+
+\def\@listvi{\leftmargin\leftmarginvi
+     \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\def\labelenumi{\theenumi.}
+\def\theenumi{\arabic{enumi}}
+
+\def\labelenumii{(\theenumii)}
+\def\theenumii{\alph{enumii}}
+\def\p@enumii{\theenumi}
+
+\def\labelenumiii{\theenumiii.}
+\def\theenumiii{\roman{enumiii}}
+\def\p@enumiii{\theenumi(\theenumii)}
+
+\def\labelenumiv{\theenumiv.}
+\def\theenumiv{\Alph{enumiv}}
+\def\p@enumiv{\p@enumiii\theenumiii}
+
+\def\labelitemi{$\bullet$}
+\def\labelitemii{\bf --}
+\def\labelitemiii{$\ast$}
+\def\labelitemiv{$\cdot$}
+
+\def\verse{\let\\=\@centercr
+  \list{}{\itemsep\z@ \itemindent -1.5em\listparindent \itemindent
+          \rightmargin\leftmargin\advance\leftmargin 1.5em}\item[]}
+\let\endverse\endlist
+
+\def\quotation{\list{}{\listparindent 1.5em
+    \itemindent\listparindent
+    \rightmargin\leftmargin \parsep 0pt plus 1pt}\item[]}
+\let\endquotation=\endlist
+
+\def\quote{\list{}{\rightmargin\leftmargin}\item[]}
+\let\endquote=\endlist
+
+\def\descriptionlabel#1{\hspace\labelsep \bf #1}
+\def\description{\list{}{\labelwidth\z@ \itemindent-\leftmargin
+       \let\makelabel\descriptionlabel}}
+
+\let\enddescription\endlist
+
+\def\theequation{\arabic{equation}}
+
+\arraycolsep 4.5pt   % Half the space between columns in an array environment.
+\tabcolsep 5.4pt     % Half the space between columns in a tabular environment.
+\arrayrulewidth .4pt % Width of rules in array and tabular environment.
+\doublerulesep 1.8pt % Space between adjacent rules in array or tabular env.
+
+\tabbingsep \labelsep   % Space used by the \' command.  (See LaTeX manual.)
+
+\skip\@mpfootins =\skip\footins
+
+\fboxsep =2.7pt      % Space left between box and text by \fbox and \framebox.
+\fboxrule =.4pt      % Width of rules in box made by \fbox and \framebox.
+
+\def\thepart{\Roman{part}} % Roman numeral part numbers.
+\def\thesection       {\arabic{section}}
+\def\thesubsection    {\thesection.\arabic{subsection}}
+%\def\thesubsubsection {\thesubsection.\arabic{subsubsection}} % GM 7/30/2002
+%\def\theparagraph     {\thesubsubsection.\arabic{paragraph}}  % GM 7/30/2002
+\def\thesubparagraph  {\theparagraph.\arabic{subparagraph}}
+
+\def\@pnumwidth{1.55em}
+\def\@tocrmarg {2.55em}
+\def\@dotsep{4.5}
+\setcounter{tocdepth}{3}
+
+\def\tableofcontents{\@latexerr{\tableofcontents: Tables of contents are not
+  allowed in the `acmconf' document style.}\@eha}
+
+\def\l@part#1#2{\addpenalty{\@secpenalty}
+   \addvspace{2.25em plus 1pt}  % space above part line
+   \begingroup
+   \@tempdima 3em       % width of box holding part number, used by
+     \parindent \z@ \rightskip \@pnumwidth      %% \numberline
+     \parfillskip -\@pnumwidth
+     {\large \bf        % set line in \large boldface
+     \leavevmode        % TeX command to enter horizontal mode.
+     #1\hfil \hbox to\@pnumwidth{\hss #2}}\par
+     \nobreak           % Never break after part entry
+   \endgroup}
+
+\def\l@section#1#2{\addpenalty{\@secpenalty} % good place for page break
+   \addvspace{1.0em plus 1pt}   % space above toc entry
+   \@tempdima 1.5em             % width of box holding section number
+   \begingroup
+     \parindent \z@ \rightskip \@pnumwidth
+     \parfillskip -\@pnumwidth
+     \bf                        % Boldface.
+     \leavevmode                % TeX command to enter horizontal mode.
+      \advance\leftskip\@tempdima %% added 5 Feb 88 to conform to
+      \hskip -\leftskip           %% 25 Jan 88 change to \numberline
+     #1\nobreak\hfil \nobreak\hbox to\@pnumwidth{\hss #2}\par
+   \endgroup}
+
+
+\def\l@subsection{\@dottedtocline{2}{1.5em}{2.3em}}
+\def\l@subsubsection{\@dottedtocline{3}{3.8em}{3.2em}}
+\def\l@paragraph{\@dottedtocline{4}{7.0em}{4.1em}}
+\def\l@subparagraph{\@dottedtocline{5}{10em}{5em}}
+
+\def\listoffigures{\@latexerr{\listoffigures: Lists of figures are not
+  allowed in the `acmconf' document style.}\@eha}
+
+\def\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
+
+\def\listoftables{\@latexerr{\listoftables: Lists of tables are not
+  allowed in the `acmconf' document style.}\@eha}
+\let\l@table\l@figure
+
+\def\footnoterule{\kern-3\p@
+  \hrule width .4\columnwidth
+  \kern 2.6\p@}                 % The \hrule has default height of .4pt .
+% ------
+\long\def\@makefntext#1{\noindent 
+%\hbox to .5em{\hss$^{\@thefnmark}$}#1}   % original
+\hbox to .5em{\hss\textsuperscript{\@thefnmark}}#1}  % C. Clifton / GM Oct. 2nd. 2002
+% -------
+
+\long\def\@maketntext#1{\noindent
+#1}
+
+\long\def\@maketitlenotetext#1#2{\noindent
+            \hbox to 1.8em{\hss$^{#1}$}#2}
+
+\setcounter{topnumber}{2}
+\def\topfraction{.7}
+\setcounter{bottomnumber}{1}
+\def\bottomfraction{.3}
+\setcounter{totalnumber}{3}
+\def\textfraction{.2}
+\def\floatpagefraction{.5}
+\setcounter{dbltopnumber}{2}
+\def\dbltopfraction{.7}
+\def\dblfloatpagefraction{.5}
+
+% According to the CHI specifications, captions should be centered.
+% This looks kind of bad for multi-line captions, so I only center
+% 1-line captions.  
+\long\def\@makecaption#1#2{
+  \vskip 2pt % \baselineskip
+  \setbox\@tempboxa\hbox{\small
+\textbf{#1. #2}} % DLC
+  \ifdim \wd\@tempboxa >\hsize % IF longer than one line:
+     \small\textbf{#1. #2}\par     %   THEN set as ordinary paragraph.
+    \else                      %   ELSE  center. 
+     \hbox to\hsize{\hfil\box\@tempboxa\hfil}\par
+  \fi}
+
+\@ifundefined{figure}{\newcounter {figure}} % this is for LaTeX2e
+
+\def\fps@figure{tbp}
+\def\ftype@figure{1}
+\def\ext@figure{lof}
+\def\fnum@figure{Figure \thefigure}
+\def\figure{\@float{figure}}
+\def\endfigure{\end@float}
+\@namedef{figure*}{\@dblfloat{figure}}
+\@namedef{endfigure*}{\end@dblfloat}
+
+\@ifundefined{table}{\newcounter {table}} % this is for LaTeX2e
+
+\def\fps@table{tbp}
+\def\ftype@table{2}
+\def\ext@table{lot}
+\def\fnum@table{Table \thetable}
+\def\table{\@float{table}}
+\def\endtable{\end@float}
+\@namedef{table*}{\@dblfloat{table}}
+\@namedef{endtable*}{\end@dblfloat}
+
+% [jdf] create a \teaser command for adding a figure in title page
+\newcommand{\chi@empty}{}
+\newcommand{\chi@teaser}{}
+\newcommand{\teaser}[1]{\renewcommand{\chi@teaser}{#1}}
+
+\newlength{\teaserspace}
+\setlength{\teaserspace}{0.25in}
+%  [jdf] end teaser
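+% Example usage of \teaser (a sketch; the file name is illustrative and
+% \includegraphics assumes the graphicx package is loaded -- \caption works
+% here because \@maketitle sets \@captype to 'figure' before typesetting it):
+%   \teaser{\centering
+%     \includegraphics[width=\textwidth]{figures/teaser}
+%     \caption{Overview figure shown beneath the title block.}}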
+
+\newtoks\titleboxnotes
+\newcount\titleboxnoteflag
+
+\def\maketitle{\par
+ \begingroup
+   \def\thefootnote{\fnsymbol{footnote}}
+   \def\@makefnmark{\hbox
+       to 0pt{$^{\@thefnmark}$\hss}}
+     \twocolumn[\@maketitle]
+\@thanks
+ \endgroup
+ \setcounter{footnote}{0}
+ \let\maketitle\relax
+ \let\@maketitle\relax
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\gdef\@subtitle{}\let\thanks\relax
+ \@copyrightspace} % JRB 2008-08-13 this line was commented out for some reason
+                   % which meant that the copyright space wasn't reserved.
+                   % I re-enabled it.
+
+% actual class setup happens here:
+  \NeedsTeXFormat{LaTeX2e}
+  \ProvidesClass{sigchi} [2011/06/06 - V0.16]  % DLC
+  \RequirePackage{latexsym}% QUERY: are these two really needed?
+
+\def\@height{height}
+\def\@width{width}
+\def\@minus{minus}
+\def\@plus{plus}
+\def\hb@xt@{\hbox to}
+\newif\if@faircopy
+\@faircopyfalse
+\def\ds@faircopy{\@faircopytrue}
+
+\def\ds@preprint{\@faircopyfalse}
+
+\@twosidetrue
+\@mparswitchtrue
+\def\ds@draft{\overfullrule 5\p@}
+
+\lineskip \p@
+\normallineskip \p@
+\def\baselinestretch{1}
+\def\@ptsize{0} %needed for amssymbols.sty
+
+% allow use of old-style font change commands in LaTeX2e
+\@maxdepth\maxdepth
+%
+\DeclareOldFontCommand{\rm}{\ninept\rmfamily}{\mathrm}
+\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf}
+\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt}
+\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf}
+\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit}
+\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl}
+\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc}
+\DeclareRobustCommand*{\cal}{\@fontswitch{\relax}{\mathcal}}
+\DeclareRobustCommand*{\mit}{\@fontswitch{\relax}{\mathnormal}}
+
+ \renewcommand{\rmdefault}{cmr}  % was 'ttm'
+% Note! I have also found 'mvr' to work ESPECIALLY well.
+% Gerry - October 1999
+% You may need to change your LV1times.fd file so that sc is
+% mapped to cmcsc - -for smallcaps -- that is if you decide
+% to change {cmr} to {times} above. (Not recommended)
+  \renewcommand{\@ptsize}{}
+  \renewcommand{\normalsize}{%
+%    \@setfontsize\normalsize\@ixpt{10.5\p@}%\ninept%
+    \@setfontsize\normalsize\@xpt{11\p@}%\ninept% % from 9/10.5 to 10/11 00-09-07 job
+    \abovedisplayskip 6\p@ \@plus2\p@ \@minus\p@
+    \belowdisplayskip \abovedisplayskip
+    \abovedisplayshortskip 6\p@ \@minus 3\p@
+    \belowdisplayshortskip 6\p@ \@minus 3\p@
+    \let\@listi\@listI
+  }
+
+  \newcommand\scriptsize{\@setfontsize\scriptsize\@viipt{8\p@}}
+  \newcommand\tiny{\@setfontsize\tiny\@vpt{6\p@}}
+  \newcommand\large{\@setfontsize\large\@xiipt{14\p@}}
+  \newcommand\Large{\@setfontsize\Large\@xivpt{18\p@}}
+  \newcommand\LARGE{\@setfontsize\LARGE\@xviipt{20\p@}}
+  \newcommand\huge{\@setfontsize\huge\@xxpt{25\p@}}
+  \newcommand\Huge{\@setfontsize\Huge\@xxvpt{30\p@}}
+
+\normalsize
+
+% make aubox hsize/number of authors up to 3, less gutter
+% then showbox gutter showbox gutter showbox -- GKMT Aug 99
+%\newbox\@acmtitlebox % [jdf] no use of the box
+\def\@maketitle{\newpage
+ \null
+%\setbox\@acmtitlebox\vbox{   % [jdf] box not used
+\baselineskip 20pt            % [jdf] no skip
+  \vskip 1em                   % [jdf] removed Vertical space above title.
+   \begin{center}
+    {\ttlfnt \@title\par}       % Title set in 18pt Helvetica (Arial) bold size.
+    \vskip 0.5ex                % [jdf] Vertical space after title.
+%This should be the subtitle.
+{\subttlfnt \the\subtitletext\par}\vskip 0.7ex%\fi [jdf] less vertical space
+    {\baselineskip 16pt\aufnt   % each author set in 12pt Arial, in a
+     \lineskip .5em             % tabular environment
+     \begin{tabular}[t]{c}\@author
+     \end{tabular}\par}
+    \vskip 1em                  % [jdf] reduced Vertical space after author.
+   \end{center}%                % [jdf] begin add teaser
+   \ifx\chi@teaser\chi@empty \else%
+    \begingroup%
+      \def\@captype{figure}%
+        \chi@teaser%
+        \endgroup\par%
+        \vspace{\teaserspace}%  % [jdf] end teaser
+    \fi}
+% \dimen0=\ht\@acmtitlebox % [jdf] removed box stuff
+% \advance\dimen0 by -12.75pc\relax % Increased space for title box -- KBT
+% \advance\dimen0 by -8.75pc\relax % Increased space for title box -- KBT  % 00-09-07 job (decreased)
+% \unvbox\@acmtitlebox
+% \ifdim\dimen0<0.0pt\relax\vskip-\dimen0\fi}
+
+
+\newcount\titlenotecount
+\global\titlenotecount=0
+\newtoks\tntoks
+\newtoks\tntokstwo
+\newtoks\tntoksthree
+\newtoks\tntoksfour
+\newtoks\tntoksfive
+
+\def\abstract{
+\ifnum\titlenotecount>0 % was =1
+    \insert\footins{%
+    \reset@font\footnotesize
+        \interlinepenalty\interfootnotelinepenalty
+        \splittopskip\footnotesep
+        \splitmaxdepth \dp\strutbox \floatingpenalty \@MM
+        \hsize\columnwidth \@parboxrestore
+        \protected@edef\@currentlabel{%
+        }%
+        \color@begingroup
+\ifnum\titlenotecount=1
+      \@maketntext{%
+         \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\@finalstrut\strutbox}%
+\fi
+\ifnum\titlenotecount=2
+      \@maketntext{%
+      \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\@finalstrut\strutbox}%
+\fi
+\ifnum\titlenotecount=3
+      \@maketntext{%
+         \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\@finalstrut\strutbox}%
+\fi
+\ifnum\titlenotecount=4
+      \@maketntext{%
+         \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\S$}\rule\z@\footnotesep\ignorespaces\the\tntoksfour\@finalstrut\strutbox}%
+\fi
+\ifnum\titlenotecount=5
+      \@maketntext{%
+         \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\S$}\rule\z@\footnotesep\ignorespaces\the\tntoksfour\par\@finalstrut\strutbox}%
+\@maketntext{%
+         \raisebox{4pt}{$\P$}\rule\z@\footnotesep\ignorespaces\the\tntoksfive\@finalstrut\strutbox}%
+\fi
+   \color@endgroup} %g}
+\fi
+\setcounter{footnote}{0}
+\section*{ABSTRACT}\normalsize %\the\parskip \the\baselineskip%\ninept
+}
+
+\def\endabstract{\if@twocolumn\else\endquotation\fi}
+
+\def\keywords#1{%\if@twocolumn
+\section*{Author Keywords}
+\begin{flushleft}#1\end{flushleft}
+%\else \small
+%\quotation #1
+%\fi
+}
+
+\def\classification#1{%\if@twocolumn
+\section*{ACM Classification Keywords}
+\begin{flushleft}#1\end{flushleft}
+%\else \small
+%\quotation\the\parskip
+%\fi
+}
+
+% I've pulled the check for 2 cols, since proceedings are _always_
+% two-column  11 Jan 2000 gkmt
+\def\terms#1{%\if@twocolumn
+\section*{General Terms}
+\begin{flushleft}#1\end{flushleft}
+%\else \small
+%\quotation\the\parskip
+%\fi
+}
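+% Example usage (illustrative strings; each command emits its own unnumbered
+% section):
+%   \keywords{wikis; peer production; replication}
+%   \terms{Human Factors; Measurement}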
+
+% -- Classification needs to be a bit smart due to optionals - Gerry/Georgia November 2nd. 1999
+\newcount\catcount
+\global\catcount=1
+
+\def\category#1#2#3{%
+\ifnum\catcount=1
+\section*{ACM Classification Keywords}  % DLC
+\advance\catcount by 1\else{\unskip; }\fi
+    \@ifnextchar [{\@category{#1}{#2}{#3}}{\@category{#1}{#2}{#3}[]}%
+}
+
+\def\@category#1#2#3[#4]{%
+    \begingroup
+        \let\and\relax
+%            #1 [\textbf{#2}]% 
+            #1 #2%   % DLC
+            \if!#4!%
+                \if!#3!\else : #3\fi
+            \else
+                :\space
+                \if!#3!\else #3\kern\z@---\hskip\z@\fi
+                \textit{#4}%
+            \fi
+    \endgroup
+}
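+% Example usage (illustrative 1998 ACM CCS entries; the bracketed fourth
+% argument is optional):
+%   \category{H.5.3}{Information Interfaces and Presentation}{Group and
+%     Organization Interfaces}[Computer-supported cooperative work]
+%   \category{K.4.3}{Computers and Society}{Organizational Impacts}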
+%
+
+%%% This section (written by KBT) handles the 1" box in the lower left
+%%% corner of the left column of the first page by creating a picture,
+%%% and inserting the predefined string at the bottom (with a negative
+%%% displacement to offset the space allocated for a non-existent
+%%% caption).
+%%%
+\newtoks\copyrightnotice
+\def\ftype@copyrightbox{8}
+\def\@copyrightspace{
+\@float{copyrightbox}[b]
+\begin{center}
+\setlength{\unitlength}{1pc}
+\ifnum\ACM@basesize=9
+\begin{picture}(20,6) %Space for copyright notice
+\put(0,-0.95){\crnotice{\@toappear}}
+\end{picture}
+\fi
+\ifnum\ACM@basesize=10
+\begin{picture}(20,7) %Space for copyright notice
+\put(0,-0.95){\crnotice{\@toappear}}
+\end{picture}
+\fi
+\end{center}
+\end@float}
+
+\def\@toappear{} % Default setting blank - commands below change this.
+\long\def\toappear#1{\def\@toappear{\parbox[b]{20pc}{\baselineskip 9pt#1}}}
+\def\toappearbox#1{\def\@toappear{\raisebox{5pt}{\framebox[20pc]{\parbox[b]{19pc}{#1}}}}}
+
+\newtoks\conf
+\newtoks\confinfo
+\def\conferenceinfo#1#2{\global\conf={#1}\global\confinfo{#2}}
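+% Example usage (illustrative venue data; a document-level \toappear{...}
+% would override the default copyright strip assembled at the end of this
+% class):
+%   \conferenceinfo{CHI 2016}{May 7--12, 2016, San Jose, CA, USA}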
+
+
+% Introduce a "chi_draft" option which conditionally enables marginpars with a warning.
+\let\oldmarginpar\marginpar
+\renewcommand{\marginpar}[2][]{\@latexerr{The marginpar command is not allowed in the
+  `acmconf' document style.}\@eha}
+\DeclareOption{chi_draft}{
+  \marginparwidth 40pt
+  \renewcommand{\marginpar}[2][]{\@latex@warning{The marginpar command is not allowed in the `acmconf'
+  document style. Remove all occurrences before final document submission.}\oldmarginpar[#1]{#2}}
+}
+% Immediately execute the options at this point. Reason is discussed
+% here: https://tex.stackexchange.com/questions/203387/
+\ProcessOptions
+
+\mark{{}{}}     % Initializes TeX's marks
+
+\def\today{\ifcase\month\or
+  January\or February\or March\or April\or May\or June\or
+  July\or August\or September\or October\or November\or December\fi
+  \space\number\day, \number\year}
+
+\def\@begintheorem#1#2{%
+    \trivlist
+    \item[%
+        \hskip 10\p@
+        \hskip \labelsep
+        {{\sc #1}\hskip 5\p@\relax#2.}%
+    ]
+    \it
+}
+\def\@opargbegintheorem#1#2#3{%
+    \trivlist
+    \item[%
+        \hskip 10\p@
+        \hskip \labelsep
+        {\sc #1\ #2\             % This mod by Gerry to enumerate corollaries
+   \setbox\@tempboxa\hbox{(#3)}  % and bracket the 'corollary title'
+        \ifdim \wd\@tempboxa>\z@ % and retain the correct numbering of e.g. theorems
+            \hskip 5\p@\relax    % if they occur 'around' said corollaries.
+            \box\@tempboxa       % Gerry - Nov. 1999.
+        \fi.}%
+    ]
+    \it
+}
+\newif\if@qeded
+\global\@qededfalse
+
+% -- original
+%\def\proof{%
+%  \vspace{-\parskip} % GM July 2000 (for tighter spacing)
+%    \global\@qededfalse
+%    \@ifnextchar[{\@xproof}{\@proof}%
+%}
+% -- end of original
+
+% (JSS) Fix for vertical spacing bug - Gerry Murray July 30th. 2002
+\def\proof{%
+\vspace{-\lastskip}\vspace{-\parsep}\penalty-51%
+\global\@qededfalse
+\@ifnextchar[{\@xproof}{\@proof}%
+}
+
+\def\endproof{%
+    \if@qeded\else\qed\fi
+    \endtrivlist
+}
+\def\@proof{%
+    \trivlist
+    \item[%
+        \hskip 10\p@
+        \hskip \labelsep
+        {\sc Proof.}%
+    ]
+    \ignorespaces
+}
+\def\@xproof[#1]{%
+    \trivlist
+    \item[\hskip 10\p@\hskip \labelsep{\sc Proof #1.}]%
+    \ignorespaces
+}
+\def\qed{%
+    \unskip
+    \kern 10\p@
+    \begingroup
+        \unitlength\p@
+        \linethickness{.4\p@}%
+        \framebox(6,6){}%
+    \endgroup
+    \global\@qededtrue
+}
+
+\def\newdef#1#2{%
+    \expandafter\@ifdefinable\csname #1\endcsname
+        {\@definecounter{#1}%
+         \expandafter\xdef\csname the#1\endcsname{\@thmcounter{#1}}%
+         \global\@namedef{#1}{\@defthm{#1}{#2}}%
+         \global\@namedef{end#1}{\@endtheorem}%
+    }%
+}
+\def\@defthm#1#2{%
+    \refstepcounter{#1}%
+    \@ifnextchar[{\@ydefthm{#1}{#2}}{\@xdefthm{#1}{#2}}%
+}
+\def\@xdefthm#1#2{%
+    \@begindef{#2}{\csname the#1\endcsname}%
+    \ignorespaces
+}
+\def\@ydefthm#1#2[#3]{%
+    \trivlist
+    \item[%
+        \hskip 10\p@
+        \hskip \labelsep
+        {\it #2%
+         \savebox\@tempboxa{#3}%
+         \ifdim \wd\@tempboxa>\z@
+            \ \box\@tempboxa
+         \fi.%
+        }]%
+    \ignorespaces
+}
+\def\@begindef#1#2{%
+    \trivlist
+    \item[%
+        \hskip 10\p@
+        \hskip \labelsep
+        {\it #1\ \rm #2.}%
+    ]%
+}
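+% Example usage (hypothetical environment name 'remark'; the environment is
+% numbered via \@thmcounter, per \newdef and \@defthm above):
+%   \newdef{remark}{Remark}
+%   \begin{remark}Wikia hosts many small wikis.\end{remark}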
+\def\theequation{\arabic{equation}}
+
+\newcounter{part}
+\newcounter{section}
+\newcounter{subsection}[section]
+\newcounter{subsubsection}[subsection]
+\newcounter{paragraph}[subsubsection]
+\def\thepart{\Roman{part}}
+\def\thesection{\arabic{section}}
+\def\thesubsection{\thesection.\arabic{subsection}}
+\def\thesubsubsection{\thesubsection.\arabic{subsubsection}} %removed \subsecfnt 29 July 2002 gkmt
+\def\theparagraph{\thesubsubsection.\arabic{paragraph}} %removed \subsecfnt 29 July 2002 gkmt
+
+\newif\if@uchead
+\@ucheadfalse
+
+%% CHANGES: NEW NOTE
+%% NOTE: OK to use old-style font commands below, since they were
+%% suitably redefined for LaTeX2e
+%% END CHANGES
+\def\part{%
+    \@startsection{part}{9}{\z@}{-10\p@ \@plus -4\p@ \@minus -2\p@}
+        {4\p@}{\normalsize\@ucheadtrue}%
+}
+
+% Rationale for changes made in the next four definitions:
+% "Before skip" is made elastic to provide some give in setting columns (vs.
+% \parskip, which is kept non-elastic so that section headers stay "anchored"
+% to their subsequent text).
+%
+% "After skip" is minimized -- BUT setting it to 0pt resulted in run-in heads,
+% even though the documentation asserts that only an after-skip < 0pt should
+% have that result.
+%
+% Baselineskip is added to the style so that multi-line section titles, and
+% section heads followed by another section head rather than text, are
+% decently spaced vertically.
+% 12 Jan 2000 gkmt
+\def\section{%
+    \@startsection{section}{1}{\z@}{8pt plus 3pt minus 3pt}%
+    {0.5pt}{\baselineskip=14pt\secfnt\@ucheadtrue}%
+}
+
+\def\subsection{%
+    \@startsection{subsection}{2}{\z@}{8pt plus 3pt minus 3pt}%
+    {0.5pt}{\baselineskip=14pt\secfnt}%
+}
+\def\subsubsection{%
+    \@startsection{subsubsection}{3}{\z@}{4pt plus 2pt minus 1pt}%
+    {0.5pt}{\baselineskip=14pt\subsecfnt}%
+}
+
+\def\paragraph{%
+    \@startsection{paragraph}{3}{\z@}{4pt plus 2pt minus 1pt}%
+    {0.5pt}{\baselineskip=14pt\subsecfnt}%
+}
+
+\let\@period=.
+\def\@startsection#1#2#3#4#5#6{%
+        \if@noskipsec  %gkmt, 11 aug 99
+        \global\let\@period\@empty
+        \leavevmode
+        \global\let\@period.%
+    \fi
+    \par
+    \@tempskipa #4\relax
+    \@afterindenttrue
+    \ifdim \@tempskipa <\z@
+        \@tempskipa -\@tempskipa
+        \@afterindentfalse
+    \fi
+    %\if@nobreak  11 Jan 00 gkmt
+        %\everypar{}
+    %\else
+        \addpenalty\@secpenalty
+        \addvspace\@tempskipa
+    %\fi
+    \parskip=0pt
+    \@ifstar
+        {\@ssect{#3}{#4}{#5}{#6}}
+        {\@dblarg{\@sect{#1}{#2}{#3}{#4}{#5}{#6}}}%
+}
+
+
+\def\@ssect#1#2#3#4#5{%
+  \@tempskipa #3\relax
+  \ifdim \@tempskipa>\z@
+    \begingroup
+      #4{%
+        \@hangfrom{\hskip #1}%
+          \interlinepenalty \@M #5\@@par}%
+    \endgroup
+  \else
+    \def\@svsechd{#4{\hskip #1\relax #5}}%
+  \fi
+  \par\nobreak
+  \vskip -6pt  % [jdf] less space between section as in the Word template
+  \@xsect{#3}\parskip=6pt} % [jdf] paragraph skip shorter
+
+
+\def\@sect#1#2#3#4#5#6[#7]#8{%
+    \ifnum #2>\c@secnumdepth
+        \let\@svsec\@empty
+    \else
+        \refstepcounter{#1}%
+        \edef\@svsec{%
+            \begingroup
+                %\ifnum#2>2 \noexpand\rm \fi % changed to next 29 July 2002 gkmt
+            \ifnum#2>2 \noexpand#6 \fi
+                \csname the#1\endcsname
+            \endgroup
+            \ifnum #2=1\relax .\fi
+            \hskip 1em
+        }%
+    \fi
+    \@tempskipa #5\relax
+    \ifdim \@tempskipa>\z@
+        \begingroup
+            #6\relax
+            \@hangfrom{\hskip #3\relax\@svsec}%
+            \begingroup
+                \interlinepenalty \@M
+                \if@uchead
+                    \uppercase{#8}%
+                \else
+                    #8%
+                \fi
+                \par
+            \endgroup
+        \endgroup
+        \csname #1mark\endcsname{#7}%
+        \par\nobreak
+        \vskip -6pt  % [jdf] less space between section as in the Word template
+      \addcontentsline{toc}{#1}{%
+            \ifnum #2>\c@secnumdepth \else
+                \protect\numberline{\csname the#1\endcsname}%
+            \fi
+            #7%
+        }%
+    \else
+        \def\@svsechd{%
+            #6%
+            \hskip #3\relax
+            \@svsec
+            \if@uchead
+                \uppercase{#8}%
+            \else
+                #8%
+            \fi
+            \csname #1mark\endcsname{#7}%
+            \addcontentsline{toc}{#1}{%
+                \ifnum #2>\c@secnumdepth \else
+                    \protect\numberline{\csname the#1\endcsname}%
+                \fi
+                #7%
+            }%
+        }%
+    \fi
+    \@xsect{#5}\parskip=6pt% [jdf] shorter
+}
+\def\@xsect#1{%
+    \@tempskipa #1\relax
+    \ifdim \@tempskipa>\z@
+        \par
+        \nobreak
+        \vskip \@tempskipa
+        \@afterheading
+    \else
+        \global\@nobreakfalse
+        \global\@noskipsectrue
+        \everypar{%
+            \if@noskipsec
+                \global\@noskipsecfalse
+                \clubpenalty\@M
+                \hskip -\parindent
+                \begingroup
+                    \@svsechd
+                    \@period
+                \endgroup
+                \unskip
+                \@tempskipa #1\relax
+                \hskip -\@tempskipa
+            \else
+                \clubpenalty \@clubpenalty
+                \everypar{}%
+            \fi
+        }%
+    \fi
+    \ignorespaces
+}
+
+\def\@trivlist{%
+    \@topsepadd\topsep
+    \if@noskipsec
+        \global\let\@period\@empty
+        \leavevmode
+        \global\let\@period.%
+    \fi
+    \ifvmode
+        \advance\@topsepadd\partopsep
+    \else
+        \unskip
+        \par
+    \fi
+    \if@inlabel
+        \@noparitemtrue
+        \@noparlisttrue
+    \else
+        \@noparlistfalse
+        \@topsep\@topsepadd
+    \fi
+    \advance\@topsep \parskip
+    \leftskip\z@skip
+    \rightskip\@rightskip
+    \parfillskip\@flushglue
+    \@setpar{\if@newlist\else{\@@par}\fi}
+    \global\@newlisttrue
+    \@outerparskip\parskip
+}
+
+%%% Actually, 'abbrev' works just fine as the default - Gerry Feb. 2000
+%%% Bibliography style.
+
+\parindent 0pt
+\typeout{Using 'Abbrev' bibliography style}
+\newcommand\bibyear[2]{%
+    \unskip\quad\ignorespaces#1\unskip
+    \if#2..\quad \else \quad#2 \fi
+}
+\newcommand{\bibemph}[1]{{\em#1}}
+\newcommand{\bibemphic}[1]{{\em#1\/}}
+\newcommand{\bibsc}[1]{{\sc#1}}
+\def\@normalcite{%
+    \def\@cite##1##2{[##1\if@tempswa , ##2\fi]}%
+}
+\def\@citeNB{%
+    \def\@cite##1##2{##1\if@tempswa , ##2\fi}%
+}
+\def\@citeRB{%
+    \def\@cite##1##2{##1\if@tempswa , ##2\fi]}%
+}
+\def\start@cite#1#2{%
+    \edef\citeauthoryear##1##2##3{%
+        ###1%
+        \ifnum#2=\z@ \else\ ###2\fi
+    }%
+    \ifnum#1=\thr@@
+        \let\@@cite\@citeyear
+    \else
+        \let\@@cite\@citenormal
+    \fi
+    \@ifstar{\@citeNB\@@cite}{\@normalcite\@@cite}%
+}
+\def\cite{\start@cite23}
+\def\citeNP{\cite*}
+\def\citeA{\start@cite10}
+\def\citeANP{\citeA*}
+\def\shortcite{\start@cite23}
+\def\shortciteNP{\shortcite*}
+\def\shortciteA{\start@cite20}
+\def\shortciteANP{\shortciteA*}
+\def\citeyear{\start@cite30}
+\def\citeyearNP{\citeyear*}
+\def\citeN{%
+    \@citeRB
+    \def\citeauthoryear##1##2##3{##1\ [##3%
+        \def\reserved@a{##1}%
+        \def\citeauthoryear####1####2####3{%
+            \def\reserved@b{####1}%
+            \ifx\reserved@a\reserved@b
+                ####3%
+            \else
+                \errmessage{Package acmart Error: author mismatch
+                         in \string\citeN^^J^^J%
+                    See the acmart package documentation for explanation}%
+            \fi
+        }%
+    }%
+    \@ifstar\@citeyear\@citeyear
+}
+\def\shortciteN{%
+    \@citeRB
+    \def\citeauthoryear##1##2##3{##2\ [##3%
+        \def\reserved@a{##2}%
+        \def\citeauthoryear####1####2####3{%
+            \def\reserved@b{####2}%
+            \ifx\reserved@a\reserved@b
+                ####3%
+            \else
+                \errmessage{Package acmart Error: author mismatch
+                         in \string\shortciteN^^J^^J%
+                    See the acmart package documentation for explanation}%
+            \fi
+        }%
+    }%
+    \@ifstar\@citeyear\@citeyear % changed from  "\@ifstart" 12 Jan 2000 gkmt
+}
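+% Example usage of the citation commands above (hypothetical BibTeX key
+% 'key2000'; the exact rendering depends on the bibliography style in use):
+%   \cite{key2000}     % bracketed citation, e.g. [1]
+%   \citeNP{key2000}   % same, without brackets
+%   \citeA{key2000}    % author name(s) only
+%   \citeyear{key2000} % year only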
+
+\def\@citenormal{%
+    \@ifnextchar [{\@tempswatrue\@citex;}
+                  {\@tempswafalse\@citex[]}% GERRY FIX FOR BABEL 3/20/2009
+}
+\def\@citeyear{%
+    \@ifnextchar [{\@tempswatrue\@citex,}%
+                  {\@tempswafalse\@citex[]}% GERRY FIX FOR BABEL 3/20/2009
+}
+\def\@citex#1[#2]#3{%
+    \let\@citea\@empty
+    \@cite{%
+        \@for\@citeb:=#3\do{%
+            \@citea
+            \def\@citea{#1, }%         % GERRY FIX FOR BABEL 3/20/2009 -- SO THAT YOU GET [1, 2] IN THE BODY TEXT
+            \edef\@citeb{\expandafter\@iden\@citeb}%
+            \if@filesw
+                \immediate\write\@auxout{\string\citation{\@citeb}}%
+            \fi
+            \@ifundefined{b@\@citeb}{%
+                {\bf ?}%
+                \@warning{%
+                    Citation `\@citeb' on page \thepage\space undefined%
+                }%
+            }%
+            {\csname b@\@citeb\endcsname}%
+        }%
+    }{#2}%
+}
+\let\@biblabel\@gobble
+\newdimen\bibindent
+\bibindent=0em
+\newskip\bibsep % [jdf] allow bib space tweaking
+\bibsep 2pt \@plus 1pt \@minus 1pt % [jdf] initial spacing between ref entries
+\setcounter{enumi}{1}
+\def\thebibliography#1{%
+%% Comment this to have blue DOI links.
+\makeatletter
+\def\url@leostyle{% 
+\@ifundefined{selectfont}{\def\UrlFont{\sf}}{\def\UrlFont{\color{black}\small\bf\ttfamily}}}
+\makeatother
+\urlstyle{leo}
+%%
+    \section{%
+       {References}
+        \@mkboth{{\refname}}{{\refname}}%
+    }%
+%    \list{[\arabic{enumi}]}{%
+    \list{\arabic{enumi}.}{% DLC
+%        \settowidth\labelwidth{[#1]}%
+        \settowidth\labelwidth{#1.}% DLC
+        \leftmargin\labelwidth
+        \advance\leftmargin\labelsep
+        \advance\leftmargin\bibindent
+        \itemindent -\bibindent
+        \listparindent \itemindent
+        \usecounter{enumi}
+        \itemsep\bibsep% [jdf] standard spacing between references
+    }%
+    \let\newblock\@empty
+    \raggedright  %% 7 JAN 2000 gkmt
+    \sloppy
+    \sfcode`\.=1000\relax
+}
+
+
+\gdef\balancecolumns
+{\vfill\eject
+\global\@colht=\textheight
+\global\ht\@cclv=\textheight
+}
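+% Example usage (a sketch; invoke at the end of the document body, just
+% before \end{document}, to even out the columns on the last page):
+%   \balancecolumns
+%   \end{document}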
+
+\newcount\colcntr
+\global\colcntr=0
+\newbox\sig@savebox % private scratch box; do not reuse the name \savebox, which is a LaTeX command
+
+\gdef \@makecol {%
+\global\advance\colcntr by 1
+\ifnum\colcntr>2 \global\colcntr=1\fi
+   \ifvoid\footins
+     \setbox\@outputbox \box\@cclv
+   \else
+     \setbox\@outputbox \vbox{%
+\boxmaxdepth \@maxdepth
+       \@tempdima\dp\@cclv
+       \unvbox \@cclv
+       \vskip-\@tempdima
+       \vskip \skip\footins
+       \color@begingroup
+         \normalcolor
+         \footnoterule
+         \unvbox \footins
+       \color@endgroup
+       }%
+   \fi
+   \xdef\@freelist{\@freelist\@midlist}%
+   \global \let \@midlist \@empty
+   \@combinefloats
+   \ifvbox\@kludgeins
+     \@makespecialcolbox
+   \else
+     \setbox\@outputbox \vbox to\@colht {%
+\@texttop
+       \dimen@ \dp\@outputbox
+       \unvbox \@outputbox
+   \vskip -\dimen@
+       \@textbottom
+       }%
+   \fi
+   \global \maxdepth \@maxdepth
+}
+\def\titlenote{\@ifnextchar[\@xtitlenote{\stepcounter\@mpfn
+\global\advance\titlenotecount by 1
+\ifnum\titlenotecount=1
+    \raisebox{9pt}{$\ast$}
+\fi
+\ifnum\titlenotecount=2
+    \raisebox{9pt}{$\dagger$}
+\fi
+\ifnum\titlenotecount=3
+    \raisebox{9pt}{$\ddagger$}
+\fi
+\ifnum\titlenotecount=4
+\raisebox{9pt}{$\S$}
+\fi
+\ifnum\titlenotecount=5
+\raisebox{9pt}{$\P$}
+\fi
+         \@titlenotetext
+}}
+
+\long\def\@titlenotetext#1{\insert\footins{%
+\ifnum\titlenotecount=1\global\tntoks={#1}\fi
+\ifnum\titlenotecount=2\global\tntokstwo={#1}\fi
+\ifnum\titlenotecount=3\global\tntoksthree={#1}\fi
+\ifnum\titlenotecount=4\global\tntoksfour={#1}\fi
+\ifnum\titlenotecount=5\global\tntoksfive={#1}\fi
+    \reset@font\footnotesize
+    \interlinepenalty\interfootnotelinepenalty
+    \splittopskip\footnotesep
+    \splitmaxdepth \dp\strutbox \floatingpenalty \@MM
+    \hsize\columnwidth \@parboxrestore
+    \protected@edef\@currentlabel{%
+    }%
+    \color@begingroup
+   \color@endgroup}}
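+% Example usage (illustrative note text; up to five title notes are
+% supported, marked $\ast$, $\dagger$, $\ddagger$, $\S$ and $\P$ in order):
+%   \title{A Paper Title\titlenote{Both authors contributed equally.}}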
+
+%%%%%%%%%%%%%%%%%%%%%%%%%
+\ps@plain
+\baselineskip=11pt
+\let\thepage\relax % For  NO page numbers - Gerry Nov. 30th. 1999
+\def\setpagenumber#1{\global\setcounter{page}{#1}}
+%\pagenumbering{arabic}  % Arabic page numbers but commented out for NO page numbes - Gerry Nov. 30th. 1999
+\twocolumn             % Double column.
+\flushbottom           % Even bottom -- alas, does not balance columns at end of document
+\pagestyle{plain}
+
+% Need Copyright Year and Copyright Data to be user definable (in .tex file).
+% Gerry Nov. 30th. 1999
+\newtoks\copyrtyr
+\newtoks\acmcopyr
+\newtoks\boilerplate
+
+\global\acmcopyr={X-XXXXX-XX-X/XX/XX}  % Default - 5/11/2001 *** Gerry
+\global\copyrtyr={\the\year}                % Default - 3/3/2003 *** Gerry
+\def\acmPrice#1{\gdef\@acmPrice{#1}}
+\acmPrice{} %article price  % Changed to 15 - June 2012 - Gerry
+
+
+\def\CopyrightYear#1{\global\copyrtyr{#1}}
+\def\crdata#1{\global\acmcopyr{#1}}
+\def\permission#1{\global\boilerplate{#1}}
+
+% ISBN
+%
+\def\isbn#1{\global\acmcopyr={#1}}
+\isbn{978-1-4503-2138-9}
+
+\RequirePackage{url}
+\urlstyle{rm}
+\def\doi#1{\def\@doi{#1}}
+\doi{10.1145/1235}
+\def\printdoi#1{\url{#1}}
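+% Example usage (an illustrative DOI, overriding the placeholder default set
+% above):
+%   \doi{10.1145/0000001.0000001}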
+
+
+
+% Copyright
+\RequirePackage{acmcopyright}
+\setcopyright{none}
+
+%
+\global\boilerplate={\@copyrightpermission}
+%
+\newtoks\copyrightetc
+\ifnum\ACM@basesize=9\relax
+\global\copyrightetc{%
+{\noindent\confname\ \the\conf } \the\confinfo \par\smallskip
+  \if@printcopyright
+    \copyright\ \the\copyrtyr\ \@copyrightowner
+  \fi
+  \if@acmowned ISBN \else\ifnum\acm@copyrightmode=2 ISBN \else \par\smallskip ACM ISBN \fi\fi
+ \the\acmcopyr\ifx\@acmPrice\@empty.\else\dots\@acmPrice\fi\par\smallskip
+{DOI: \small\expandafter\printdoi\expandafter{\@doi}}} 
+\toappear{\fontsize{7pt}{8pt}\fontfamily{ptm}\selectfont
+  \the\boilerplate\par\smallskip
+ \the\copyrightetc}
+\fi
+\ifnum\ACM@basesize=10\relax
+\global\copyrightetc{%
+{\noindent\confname\ \the\conf\ \the\confinfo}\par\smallskip
+  \if@printcopyright
+    \copyright\ \the\copyrtyr\ \@copyrightowner
+  \fi
+  \if@acmowned ISBN \else\ifnum\acm@copyrightmode=2 ISBN \else \par\smallskip ACM ISBN \fi\fi
+ \the\acmcopyr\ifx\@acmPrice\@empty.\else\dots\@acmPrice\fi\par\smallskip
+{DOI: \small\expandafter\printdoi\expandafter{\@doi}}} 
+\toappear{\fontsize{7.5pt}{8.5pt}\fontfamily{ptm}\selectfont
+  \the\boilerplate\par\smallskip
+ \the\copyrightetc}
+\fi
+
+\clubpenalty=10000 
+\widowpenalty = 10000
+
+%% 
+% For the CCSXML 2012 Categories
+
+\let\@concepts\@empty
+% Support for CCSXML file
+\RequirePackage{comment}
+\excludecomment{CCSXML}
+
+% New concepts scheme
+%
+% The first argument is the significance, the
+% second is the concept(s)
+%
+\newcommand\ccsdesc[2][100]{%
+  \ccsdesc@parse#1~#2~}
+%
+% The parser of the expression Significance~General~Specific
+%
+\def\ccsdesc@parse#1~#2~#3~{%
+  \expandafter\ifx\csname CCS@#2\endcsname\relax
+    \expandafter\gdef\csname CCS@#2\endcsname{\textbullet\textbf{#2} $\to$ }%
+  \g@addto@macro{\@concepts}{\csname CCS@#2\endcsname}\fi
+  \expandafter\g@addto@macro\expandafter{\csname CCS@#2\endcsname}{%
+    \ifnum#1>499\textbf{#3; }\else
+    \ifnum#1>299\textit{#3; }\else
+    #3; \fi\fi}}
+
+\newcommand\printccsdesc{%
+  \ifx\@concepts\@empty\else
+  \if@twocolumn
+    \section*{CCS Concepts}
+    \@concepts
+    \else \small
+    \quotation{\@concepts}%
+    \fi
+    \fi}
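+% Example usage (illustrative 2012 CCS concepts; the optional weight defaults
+% to 100, with >499 set bold and >299 italic by the parser above):
+%   \ccsdesc[500]{Human-centered computing~Wikis}
+%   \ccsdesc[300]{Human-centered computing~Collaborative and social computing}
+%   \printccsdesc  % emits the 'CCS Concepts' section where invoked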
+
+
+%% DanCo, 10/19/11: Added this to reduce overfull lines per Sheridan
+%% request; it leads to occasionally unpleasant extra whitespace,
+%% though actually no worse and perhaps better than the 
+%% default Word template.
+%%
+%% This can be manually controlled instead by authors in text
+%% by putting \sloppy before and \fussy immediately 
+%% after troublesome paras.  Idea found at
+%% http://www.economics.utoronto.ca/osborne/latex/PMAKEUP.HTM
+
+\sloppy
diff --git a/paper_source/tables/halfak.mod.tex b/paper_source/tables/halfak.mod.tex
new file mode 100644 (file)
index 0000000..711c059
--- /dev/null
@@ -0,0 +1,23 @@
+% I modified the style of the table output from texreg but kept the numbers the same, except that I replaced very small p values with the upper bound <0.001
+\begin{table}
+\begin{center}
+
+\begin{tabular*}{\columnwidth}{@{\extracolsep{\fill}} l D{.}{.}{2.2} D{.}{.}{1.2} D{.}{.}{1.2}}
+ & \multicolumn{1}{c}{\(\beta\)} & \multicolumn{1}{c}{\(SE\)} & \multicolumn{1}{c}{\(p~\mathrm{value}\)} \\
+\midrule
+Intercept      & -2.83 & 0.88 & <0.001  \\
+Reverted       & -0.72 & 0.04 & <0.001 \\
+Messaged       & 0.68 & 0.01  & <0.001 \\
+Tool reverted  & -0.22 & 0.28 & 0.43 \\
+Session edits  & 0.33 & 0.01  & <0.001  \\
+Wiki age  & -0.37 & 0.07 & <0.001 \\
+\midrule
+Deviance       & \multicolumn{3}{l}{262447} \\
+Num. obs.      & \multicolumn{3}{l}{329636} \\
+\bottomrule
+
+\end{tabular*}
+\caption{Coefficients and standard errors estimated from our fitted logistic regression predicting newcomer survival. Coefficients for wiki and quarter are omitted from this table but available in the supplementary material.}
+\label{table:regression.1}
+\end{center}
+\end{table}
diff --git a/paper_source/tables/morgan.model.tex b/paper_source/tables/morgan.model.tex
new file mode 100644 (file)
index 0000000..f1a8b82
--- /dev/null
@@ -0,0 +1,21 @@
+% I modified the style of the table output from texreg but kept the numbers the same, except that I replaced very small p values with the upper bound <0.001
+\begin{table}
+\begin{center}
+
+\begin{tabular*}{\columnwidth}{@{\extracolsep{\fill}} l D{.}{.}{2.2} D{.}{.}{3.2} D{.}{.}{1.2}}
+ & \multicolumn{1}{c}{\(\beta\)} & \multicolumn{1}{c}{\(SE\)} & \multicolumn{1}{c}{\(p~\mathrm{value}\)} \\
+\midrule
+Intercept      & -18.40  & 500.60 & 0.97  \\
+Editor tenure  & -0.44 & 0.02    & <0.001 \\
+Wiki age       & 0.68 & 0.11     & <0.001 \\
+\midrule
+Deviance       & \multicolumn{3}{l}{96585} \\
+Num. obs.      & \multicolumn{3}{l}{703614} \\
+\bottomrule
+\end{tabular*}
+\caption{Coefficients and standard errors estimated from our fitted logistic regression predicting reverts of edits to project namespace pages. Categorical variables for wiki and quarter are omitted from this table but available in the supplementary material.}
+
+\label{table.regression.2}
+\end{center}
+
+\end{table}
diff --git a/paper_source/todo.txt b/paper_source/todo.txt
new file mode 100644 (file)
index 0000000..112e706
--- /dev/null
@@ -0,0 +1,31 @@
+
+TODO for rebuttal
+=================
++ look at options for CI (DONE, bootstrapped 95% CI looks great)
++ Polish the rebuttal
++ read through reviews again
+
+
+TODO for paper ready copy
+=========================
++ Fix Bibliography
++ Incorporate the following:
+
+Because our dataset is limited to wikis, we cannot address questions of generalizability to other sites empirically. The fact that we had to change RAD's operationalizations of key concepts to apply them to Wikia wikis suggests that tests further afield will require creative construction of new measures. With these limitations aside, we believe that our results are informative, but far from conclusive, evidence that RAD's findings generalize beyond wikis. Therefore, we will add material to the discussion asserting our results increase our expectation that similar patterns occur in other communities engaged in peer production and open collaboration. We reason that the mechanisms driving the patterns of ``calcification'' and entrenchment that we observe in wikis seem general to communities without pre-defined hierarchies or formal procedures to facilitate newcomer socialization. This builds from earlier work on oligarchy (see Shaw & Hill, 2014) and on "the tyranny of structurelessness" in democratic organizations and feminist advocacy groups reported by Freeman (1972). We will add a citation to Freeman (1972). We strongly agree with the reviewers that future research should further pursue social and psychological mechanisms of entrenchment, oligarchy formation, and newcomer socialization in open collaboration and peer production communities.
+
++ Add text to the Study 3 methods paragraph highlighting the ways in which operationalizing norm entrenchment through namespace 4 is quite different from looking at specific policy pages on Wikipedia, and flagging this decision as a potential threat to the validity of our finding.
+
++ Add bootstrapped 95% confidence intervals to Figure 1 (DONE)
+
++ Add back a concise summary of RAD's methodology.  Keep this short in the form of 3 additional paragraphs. 
+
++ Add a couple of sentences in the methods section to describe limitations of the RAD analysis. We think the biggest issues are related to the ways that the original RAD paper could not evaluate whether unique aspects of EN:Wikipedia drove the findings. This includes questions about whether something particular happened around 2008, such as the rise of Facebook and other social media platforms, that drove the editor decline.
+
++ Fix typo in interpretation of bot reverted coefficient. (DONE)
+
++ Update supplementary material with the weighted models analysis 
+
++ Add a short paragraph to the discussion section that summarizes the threat, describes our approach to addressing it with weighted models, explains that our substantive conclusions about the replicability of RAD are maintained, and points to an opportunity for future work to study how newcomer socialization may vary between communities of different size and activity levels.
+
+3. 
\ No newline at end of file
diff --git a/regen.all.sh b/regen.all.sh
new file mode 100755 (executable)
index 0000000..613bc9d
--- /dev/null
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+./01_build_datasets.R && ./02_model_newcomer_survival.R && ./03_generate_plots.R && ./04_model_namespace4.R && ./05_power_simulation.R
diff --git a/runwikiq.sh b/runwikiq.sh
new file mode 100755 (executable)
index 0000000..ebd53b8
--- /dev/null
@@ -0,0 +1,10 @@
+#!/bin/bash
+wikiq_path="PATH_TO_WIKIQ"
+raw_data_path="wikia_2010_04"
+of_path="wikiq_wikia_2010_all_nopersistance"
+dumps="$(ls $raw_data_path)"
+
+for dumpname in $dumps
+do
+python3 "$wikiq_path" -u "$raw_data_path/$dumpname" -o "$of_path"
+done
diff --git a/userroles_scraper_scripts/list_of_wikis.csv b/userroles_scraper_scripts/list_of_wikis.csv
new file mode 100644 (file)
index 0000000..9f3b933
--- /dev/null
@@ -0,0 +1,761 @@
+24,http://24.wikia.com/
+6teen,http://6teen.wikia.com/
+8bittheater,http://8bittheater.wikia.com/
+aachen,http://aachen.wikia.com/
+abridgedseries,http://abridgedseries.wikia.com/
+absurdopedia,http://absurdopedia.wikia.com/
+academia,http://academia.wikia.com/
+academicjobs,http://academicjobs.wikia.com/
+aceattorney,http://aceattorney.wikia.com/
+acecombat,http://acecombat.wikia.com/
+actionzone,http://actionzone.wikia.com/
+admintools,http://admintools.wikia.com/
+advancewars,http://advancewars.wikia.com/
+adventurequestworlds,http://adventurequestworlds.wikia.com/
+ageofempires,http://ageofempires.wikia.com/
+aion,http://aion.wikia.com/
+airframes,http://airframes.wikia.com/
+alicesoft,http://alicesoft.wikia.com/
+aliens,http://aliens.wikia.com/
+althistory,http://althistory.wikia.com/
+americandad,http://americandad.wikia.com/
+americangirl,http://americangirl.wikia.com/
+anegkyklopaideia,http://frikipaideia.wikia.com/
+animalcrossing,http://animalcrossing.wikia.com/
+anime,http://anime.wikia.com/
+animeanswers,http://anime.answers.wikia.com/
+annex,http://annex.wikia.com/
+answers,http://answers.wikia.com/
+aoc,http://aoc.wikia.com/
+aokb,http://anarchyonline.wikia.com/
+applesparkles,http://applesparkles.wikia.com/
+aq,http://adventurequest.wikia.com/
+archiesonic,http://archiesonic.wikia.com/
+artemisfowl,http://artemisfowl.wikia.com/
+aruarose,http://aruarose.wikia.com/
+asdastory,http://asdastory.wikia.com/
+aselia,http://aselia.wikia.com/
+asheron,http://asheron.wikia.com/
+assassinscreed,http://assassinscreed.wikia.com/
+astroempires,http://astroempires.wikia.com/
+astronomy,http://astronomy.wikia.com/
+atlantica,http://atlantica.wikia.com/
+audrey,http://audrey.wikia.com/
+avatar,http://avatar.wikia.com/
+avatardev,http://avatardev.wikia.com/
+avatarfanfiction,http://avatarfanon.wikia.com/
+avatartest,http://avatartest.wikia.com/
+avp,http://avp.wikia.com/
+babylon5,http://babylon5.wikia.com/
+bakugan,http://bakugan.wikia.com/
+banjokazooie,http://banjokazooie.wikia.com/
+baseball,http://baseball.wikia.com/
+basilicus,http://basilicus.wikia.com/
+batman,http://batman.wikia.com/
+battlefield,http://battlefield.wikia.com/
+battleforge,http://battleforge.wikia.com/
+battlefront,http://battlefront.wikia.com/
+beidipedia,http://beidipedia.wikia.com/
+ben10,http://ben10.wikia.com/
+bionicle,http://bionicle.wikia.com/
+bioshock,http://bioshock.wikia.com/
+bleachfanfiction,http://bleachfanfiction.wikia.com/
+blogging,http://blogging.wikia.com/
+bmet,http://bmet.wikia.com/
+board8,http://board8.wikia.com/
+borderlands,http://borderlands.wikia.com/
+brawlhacks,http://brawlhacks.wikia.com/
+bttf,http://bttf.wikia.com/
+buffy,http://buffy.wikia.com/
+bullygame,http://bullygame.wikia.com/
+bungie,http://bungie.wikia.com/
+burnoutparadise,http://burnout.wikia.com/
+buzzoutloud,http://buzzoutloud.wikia.com/
+bzpcomics,http://bzpcomics.wikia.com/
+campaigns,http://campaigns.wikia.com/
+candh,http://calvinandhobbes.wikia.com/
+cartoonnetwork,http://cartoonnetwork.wikia.com/
+castlecrashers,http://castlecrashers.wikia.com/
+castlevania,http://castlevania.wikia.com/
+caw,http://caw.wikia.com/
+ceramica,http://ceramica.wikia.com/
+championsonline,http://championsonline.wikia.com/
+chaotic,http://chaotic.wikia.com/
+charmed,http://charmed.wikia.com/
+chdk,http://chdk.wikia.com/
+chowder,http://chowder.wikia.com/
+christianity,http://christianity.wikia.com/
+christmasspecials,http://christmas-specials.wikia.com/
+chrono,http://chrono.wikia.com/
+cimmeria_aoc,http://cimmeria-aoc.wikia.com/
+cities,http://cities.wikia.com/
+cloverfield,http://cloverfield.wikia.com/
+clubchicken,http://unclubpenguin.wikia.com/
+clubpenguin,http://clubpenguin.wikia.com/
+cnc,http://cnc.wikia.com/
+codegeass,http://codegeass.wikia.com/
+codelyoko,http://codelyoko.wikia.com/
+combatarms,http://combatarms.wikia.com/
+comics,http://comics.wikia.com/
+communitytest,http://communitytest.wikia.com/
+compedia,http://compedia.wikia.com/
+conlang,http://conlang.wikia.com/
+conmyth,http://conmyth.wikia.com/
+conworld,http://conworld.wikia.com/
+cookbook_import,http://recipes.wikia.com/
+costuming,http://halocostuming.wikia.com/
+crashban,http://crashbandicoot.wikia.com/
+crazybobs,http://crazybobs.wikia.com/
+creatures,http://creatures.wikia.com/
+crimeprobe,http://publicsafety.wikia.com/
+criticalmass,http://criticalmass.wikia.com/
+custombionicle,http://custombionicle.wikia.com/
+cybernations,http://cybernations.wikia.com/
+d20npcs,http://d20npcs.wikia.com/
+danball,http://danball.wikia.com/
+darkfall,http://darkfall.wikia.com/
+darkhorizons,http://darkhorizons.wikia.com/
+darkmoonfaire,http://darkmoonfaire.wikia.com/
+darkrunescape,http://darkrunescape.wikia.com/
+darrenshan,http://cirquedufreak.wikia.com/
+darth,http://darth.wikia.com/
+dauncyclopedia,http://spademanns.wikia.com/
+dcanimated,http://dcanimated.wikia.com/
+de,http://de.wikia.com/
+deadfrontier,http://deadfrontier.wikia.com/
+deadoralive,http://deadoralive.wikia.com/
+deadrising,http://deadrising.wikia.com/
+deadspace,http://deadspace.wikia.com/
+deaokb,http://de.anarchyonline.wikia.com/
+deathnote,http://deathnote.wikia.com/
+debionicle,http://de.bionicle.wikia.com/
+debleachwiki,http://de.bleach.wikia.com/
+dedenaruto3,http://de.naruto.wikia.com/
+dedigimon,http://de.digimon.wikia.com/
+dedofus,http://de.dofuswiki.wikia.com/
+dedragonball,http://de.dragonball.wikia.com/
+deerziehungs,http://de.erziehungs.wikia.com/
+deffxi,http://de.wiki.ffxiclopedia.org/
+defiaseu,http://defiaseu.wikia.com/
+deflyff,http://de.flyff.wikia.com/
+degelsenkirchen,http://gelsenkirchen.wikia.com/
+degrassi,http://degrassi.wikia.com/
+degta,http://de.gta.wikia.com/
+dehalo,http://de.halo.wikia.com/
+dekuenstler,http://de.kuenstler.wikia.com/
+deliteratur,http://de.literatur.wikia.com/
+delostpedia,http://de.lostpedia.wikia.com/
+delphi,http://delphi.wikia.com/
+demario,http://de.mario.wikia.com/
+demarjorie,http://marjorie.wikia.com/
+dememoryalpha,http://memory-alpha.org/de/
+demini,http://de.mini.wikia.com/
+demusic,http://musik.wikia.com/
+deonepiece,http://de.onepiece.wikia.com/
+dephineasandferb,http://de.phineasandferb.wikia.com/
+derunescape,http://de.runescape.wikia.com/
+descrubs,http://de.scrubs.wikia.com/
+desencyclopedie,http://desencyclopedie.wikia.com/
+desperate,http://desperatehousewives.wikia.com/
+desportverein,http://de.sportverein.wikia.com/
+detestwiki,http://de.testwiki.wikia.com/
+deunternehmen,http://unternehmen.wikia.com/
+devilmaycry,http://devilmaycry.wikia.com/
+deyugioh2,http://de.yugioh.wikia.com/
+dgrayman,http://dgrayman.wikia.com/
+diablo,http://diablo.wikia.com/
+diealdor,http://diealdor.wikia.com/
+digibutter,http://digibutter.wikia.com/
+digimon,http://digimon.wikia.com/
+dinosaurs,http://dinosaurs.wikia.com/
+disney,http://disney.wikia.com/
+disneychannel,http://disneychannel.wikia.com/
+dissidia,http://dissidia.wikia.com/
+dizzywood,http://dizzywood.wikia.com/
+dnd,http://dnd.wikia.com/
+dndta,http://dndta.wikia.com/
+dofus,http://dofuswiki.wikia.com/
+dollhouse,http://dollhouse.wikia.com/
+doom,http://doom.wikia.com/
+dothack,http://dothack.wikia.com/
+dragonage,http://dragonage.wikia.com/
+dragonball,http://dragonball.wikia.com/
+drakensang,http://drakensang.wikia.com/
+dune,http://dune.wikia.com/
+dungeons,http://dungeons.wikia.com/
+dynastywarriors,http://koei.wikia.com/
+earthbound,http://earthbound.wikia.com/
+earthenring,http://earthenring.wikia.com/
+eberron,http://eberron.wikia.com/
+ed,http://ed.wikia.com/
+edfanon,http://edfanon.wikia.com/
+education,http://education.wikia.com/
+egamia,http://gaming.wikia.com/
+eincyclopedia,http://eincyclopedia.wikia.com/
+elderscrolls,http://elderscrolls.wikia.com/
+elementsthegame,http://elementsthegame.wikia.com/
+elona,http://elona.wikia.com/
+elwow,http://el.wow.wikia.com/
+enbleach,http://bleach.wikia.com/
+endcdatabase,http://dc.wikia.com/
+endomo,http://domo.wikia.com/
+enfairlyoddparents,http://fairlyoddparents.wikia.com/
+enhabbo,http://habbo.wikia.com/
+enkirby,http://kirby.wikia.com/
+enmarveldatabase,http://marvel.wikia.com/
+enmemoryalpha,http://memory-alpha.org/en/
+ennintendo,http://nintendo.wikia.com/
+enprinceofpersia,http://princeofpersia.wikia.com/
+enshamanking,http://shamanking.wikia.com/
+entabulawiki,http://tabularasa.wikia.com/
+entekken,http://tekken.wikia.com/
+entertainment1,http://entertainment.wikia.com/
+entranshumanism,http://transhumanism.wikia.com/
+entravel,http://travel.wikia.com/
+entravian,http://travian.wikia.com/
+eq2i,http://eq2.wikia.com/
+es,http://es.wikia.com/
+esanswers,http://respuestas.wikia.com/
+esben10,http://es.ben10.wikia.com/
+esbionicle,http://es.bionicle.wikia.com/
+esbleach,http://es.bleach.wikia.com/
+esdigimon,http://es.digimon.wikia.com/
+esdofus,http://es.dofuswiki.wikia.com/
+esdragonball,http://es.dragonball.wikia.com/
+esdrama,http://es.drama.wikia.com/
+esfakemon,http://es.fakemon.wikia.com/
+esfinalfantasy,http://es.finalfantasy.wikia.com/
+esgta,http://es.gta.wikia.com/
+eshalo,http://es.halo.wikia.com/
+esharrypotter,http://es.harrypotter.wikia.com/
+esiao,http://imperiumao.wikia.com/
+eslossimpson,http://es.simpsons.wikia.com/
+eslostpedia,http://es.lostpedia.wikia.com/
+esmortalkombat,http://es.mortalkombat.wikia.com/
+esnaruto,http://es.naruto.wikia.com/
+espokemon,http://es.pokemon.wikia.com/
+esrunescape,http://es.runescape.wikia.com/
+esstarwars,http://es.starwars.wikia.com/
+estibia,http://es.tibia.wikia.com/
+esticnologia,http://es.ticnologia.wikia.com/
+eswow,http://es.wow.wikia.com/
+esyugioh,http://es.yugioh.wikia.com/
+eszelda,http://es.zelda.wikia.com/
+evchk,http://evchk.wikia.com/
+evctw,http://evctw.wikia.com/
+eve,http://eve.wikia.com/
+events2,http://wowwikievent.wikia.com/
+ewrestling,http://ewrestling.wikia.com/
+fable,http://fable.wikia.com/
+facepunch,http://facepunch.wikia.com/
+fahrrad,http://fahrrad.wikia.com/
+fairytail,http://fairytail.wikia.com/
+fallout,http://fallout.wikia.com/
+falloutfanon,http://falloutfanon.wikia.com/
+falloutmods,http://falloutmods.wikia.com/
+familyguy,http://familyguy.wikia.com/
+fanfiction,http://fanfiction.wikia.com/
+fanon,http://fanon.wikia.com/
+fantendo,http://fantendo.wikia.com/
+fargofilmmaking,http://fargofilmmaking.wikia.com/
+farmville,http://farmville.wikia.com/
+farscape,http://farscape.wikia.com/
+fci,http://fci.wikia.com/
+fear,http://fear.wikia.com/
+ferrocarriles,http://ferrocarriles.wikia.com/
+ffxi,http://wiki.ffxiclopedia.org/
+fibionicle,http://fi.bionicle.wikia.com/
+film,http://film.wikia.com/
+filmguide,http://filmguide.wikia.com/
+finalfantasy,http://finalfantasy.wikia.com/
+fireemblem,http://fireemblem.wikia.com/
+firefly,http://firefly.wikia.com/
+firunescape,http://fi.runescape.wikia.com/
+fishwrangler,http://fishwrangler.wikia.com/
+fistarwars,http://fi.starwars.wikia.com/
+flapjack,http://flapjack.wikia.com/
+flashforward,http://flashforward.wikia.com/
+fma,http://fma.wikia.com/
+forgottenrealms,http://forgottenrealms.wikia.com/
+forscherliga,http://forscherliga.wikia.com/
+fraleatexte,http://fr.aleatexte.wikia.com/
+franswers,http://reponses.wikia.com/
+frbrunux,http://fr.brunux.wikia.com/
+frcontemporain,http://fr.contemporain.wikia.com/
+freeciv,http://freeciv.wikia.com/
+freespeech,http://freespeech.wikia.com/
+frffxi,http://fr.wiki.ffxiclopedia.org/
+frfilm,http://fr.film.wikia.com/
+frfr,http://fr.wikia.com/
+frguildwars,http://fr.guildwars.wikia.com/
+friends,http://friends.wikia.com/
+fringe,http://fringe.wikia.com/
+frlostpedia,http://fr.lostpedia.wikia.com/
+frmemoryalpha,http://memory-alpha.org/fr/
+frugooscape,http://frugooscape.wikia.com/
+frwow,http://fr.wowwiki.com/
+funorb,http://funorb.wikia.com/
+furry,http://furry.wikia.com/
+fusionfall,http://fusionfall.wikia.com/
+futurama,http://futurama.wikia.com/
+future,http://future.wikia.com/
+gaia,http://gaia.wikia.com/
+galava,http://galava.wikia.com/
+galciv,http://galciv.wikia.com/
+games,http://games.wikia.com/
+gearsofwar,http://gearsofwar.wikia.com/
+geekfeminism,http://geekfeminism.wikia.com/
+genealogy,http://familypedia.wikia.com/
+ghostbusters,http://ghostbusters.wikia.com/
+gijoe,http://gijoe.wikia.com/
+girlgenius,http://girlgenius.wikia.com/
+globalmedia,http://globalmedia.wikia.com/
+godfather,http://godfather.wikia.com/
+godofwar,http://godofwar.wikia.com/
+godzilla,http://godzilla.wikia.com/
+goldensun,http://goldensun.wikia.com/
+google,http://google.wikia.com/
+government,http://government.wikia.com/
+gowfanon,http://gowfanon.wikia.com/
+grandchase,http://grandchase.wikia.com/
+green,http://green.wikia.com/
+greenlantern,http://greenlantern.wikia.com/
+gtawiki,http://gta.wikia.com/
+guestbook,http://guestbook.wikia.com/
+guildopedia,http://guildopedia.wikia.com/
+guilds,http://neopets.wikia.com/
+guildwarsguilds,http://guildwarsguilds.wikia.com/
+guitarhero,http://guitarhero.wikia.com/
+gundam,http://gundam.wikia.com/
+guns,http://guns.wikia.com/
+gwguild,http://guildwars.wikia.com/
+halflife,http://half-life.wikia.com/
+halo,http://halo.wikia.com/
+halofanon,http://halofanon.wikia.com/
+halomachinima,http://halomachinima.wikia.com/
+hannahmontana,http://hannahmontana.wikia.com/
+happytreefriends,http://happytreefriends.wikia.com/
+harrypotter,http://harrypotter.wikia.com/
+harrypotterfanon,http://harrypotterfanon.wikia.com/
+help,http://help.wikia.com/
+heroes,http://heroes.wikia.com/
+hetalia,http://hetalia.wikia.com/
+hilfe,http://hilfe.wikia.com/
+hiliterature,http://hi.literature.wikia.com/
+hitman,http://hitman.wikia.com/
+hm,http://harvestmoon.wikia.com/
+homeandaway,http://homeandaway.wikia.com/
+homeworld,http://homeworld.wikia.com/
+horrormovies,http://horror-movies.wikia.com/
+hotwheels,http://hotwheels.wikia.com/
+house,http://house.wikia.com/
+howto,http://how-to.wikia.com/
+hrwooky,http://hrwooky.wikia.com/
+hsm,http://high-school-musical.wikia.com/
+huspam,http://hu.spam.wikia.com/
+hustarwars,http://hu.kaminopedia.wikia.com/
+icarly,http://icarly.wikia.com/
+icehockey,http://icehockey.wikia.com/
+ikariam,http://ikariam.wikia.com/
+ikkepedia,http://ikkjepedia.wikia.com/
+illogicopedia,http://wackypedia.wikia.com/
+imagine,http://imagine.wikia.com/
+inciclopedia,http://inciclopedia.wikia.com/
+india,http://india.wikia.com/
+indianajones,http://indianajones.wikia.com/
+infamous,http://infamous.wikia.com/
+inheritance,http://inheritance.wikia.com/
+internationalbusiness,http://internationalbusiness.wikia.com/
+inuyasha,http://inuyasha.wikia.com/
+ipod,http://apple.wikia.com/
+italia,http://italia.wikia.com/
+ithalo,http://it.halo.wikia.com/
+itlostpedia,http://it.lostpedia.wikia.com/
+itrunescape,http://it.runescape.wikia.com/
+itsoggettopedia,http://it.soggettopedia.wikia.com/
+ja,http://ja.wikia.com/
+jaeq2,http://ja.eq2.wikia.com/
+jakanddaxter,http://jakanddaxter.wikia.com/
+jamesbond,http://jamesbond.wikia.com/
+jamescameronsavatar,http://james-camerons-avatar.wikia.com/
+jet,http://jet.wikia.com/
+jfc,http://jfc.wikia.com/
+jfx,http://jfx.wikia.com/
+joinme,http://joinme.wikia.com/
+jurassicpark,http://jurassicpark.wikia.com/
+jvs,http://jvs.wikia.com/
+kanzaka,http://kanzaka.wikia.com/
+killzone,http://killzone.wikia.com/
+kimpossible,http://kimpossible.wikia.com/
+kingdomhearts,http://kingdomhearts.wikia.com/
+kitsch,http://kitsch.wikia.com/
+knd,http://knd.wikia.com/
+kongregate,http://kongregate.wikia.com/
+kosova,http://kosova.wikia.com/
+l5r,http://l5r.wikia.com/
+landbeforetime,http://landbeforetime.wikia.com/
+lastremnant,http://lastremnant.wikia.com/
+lawandorder,http://lawandorder.wikia.com/
+leagueoflegends,http://leagueoflegends.wikia.com/
+left,http://left.wikia.com/
+left4dead,http://left4dead.wikia.com/
+legacy,http://legacy.wikia.com/
+lego,http://lego.wikia.com/
+legobatman,http://legobatman.wikia.com/
+legoindianajones,http://legoindianajones.wikia.com/
+legostarwars,http://legostarwars.wikia.com/
+lgbt,http://lgbt.wikia.com/
+liberapedia,http://liberapedia.wikia.com/
+linux,http://linux.wikia.com/
+list,http://list.wikia.com/
+literature,http://literature.wikia.com/
+littlebigplanet,http://littlebigplanet.wikia.com/
+logocreation,http://logocreation.wikia.com/
+londonbirders,http://londonbirders.wikia.com/
+looneytunes,http://looneytunes.wikia.com/
+lost,http://lost.wikia.com/
+lostodyssey,http://lostodyssey.wikia.com/
+lostpedia,http://lostpedia.wikia.com/
+lotr,http://lotr.wikia.com/
+lotrowiki,http://lotro.wikia.com/
+lunarwars,http://lunarwars.wikia.com/
+madnesscombat,http://madnesscombat.wikia.com/
+mafiawars,http://mafiawars.wikia.com/
+maplestory,http://maplestory.wikia.com/
+mario,http://mario.wikia.com/
+mariokart,http://mariokart.wikia.com/
+marvelmovies,http://marvel-movies.wikia.com/
+masseffect,http://masseffect.wikia.com/
+math,http://math.wikia.com/
+matrixfilms,http://matrix.wikia.com/
+mcleodgaming,http://mcleodgaming.wikia.com/
+mechquest,http://mechquest.wikia.com/
+mechscape,http://stellardawn.wikia.com/
+megaman,http://megaman.wikia.com/
+megamitensei,http://megamitensei.wikia.com/
+memory_alfa,http://memory-alpha.org/it/
+memory_gamma,http://memory-gamma.wikia.com/
+metal,http://metal.wikia.com/
+metalgear,http://metalgear.wikia.com/
+metroid,http://metroid.wikia.com/
+micronaciones,http://micronaciones.wikia.com/
+micronations,http://micronations.wikia.com/
+micropedia,http://micropedia.wikia.com/
+mind_control,http://mind-control.wikia.com/
+misterwikki,http://misterwikki.wikia.com/
+mk,http://mortalkombat.wikia.com/
+monde,http://monde.wikia.com/
+monsterhunter,http://monsterhunter.wikia.com/
+moomin,http://moomin.wikia.com/
+moonguard,http://moonguard.wikia.com/
+mozilla,http://mozilla.wikia.com/
+mpd,http://mpd.wikia.com/
+mspafa,http://mspafa.wikia.com/
+mspaintadventures,http://mspaintadventures.wikia.com/
+mst3k,http://mst3k.wikia.com/
+mtg,http://mtg.wikia.com/
+mu,http://mu.wikia.com/
+muppet,http://muppet.wikia.com/
+music,http://music.wikia.com/
+mybio,http://mybio.wikia.com/
+myeloma,http://myeloma.wikia.com/
+mylegonetwork,http://mylegonetwork.wikia.com/
+mysims,http://mysims.wikia.com/
+n,http://n.wikia.com/
+namcotales,http://namcotales.wikia.com/
+narnia,http://narnia.wikia.com/
+naruto,http://naruto.wikia.com/
+narutofanon,http://narutofanon.wikia.com/
+nascar,http://thirdturn.wikia.com/
+nation,http://nation.wikia.com/
+nationstates,http://nationstates.wikia.com/
+ncis,http://ncis.wikia.com/
+necyklopedie,http://necyklopedie.wikia.com/
+nethack,http://nethack.wikia.com/
+newgrounds,http://newgrounds.wikia.com/
+nfl,http://nfl.wikia.com/
+nickelodeon,http://nickelodeon.wikia.com/
+nlmemoryalpha,http://memory-alpha.org/nl/
+nlrunescape,http://nl.runescape.wikia.com/
+nlstarwars,http://nl.starwars.wikia.com/
+nonciclopedia,http://nonciclopedia.wikia.com/
+nonsensopedia,http://nonsensopedia.wikia.pl/
+norunescape,http://no.runescape.wikia.com/
+novelas,http://fiction.wikia.com/
+nowow,http://no.wowwiki.com/
+nwn,http://nwn.wikia.com/
+nwn2,http://nwn2.wikia.com/
+nwp,http://nwp.wikia.com/
+oblivion,http://oblivion.wikia.com/
+oddworld,http://oddworld.wikia.com/
+offenbach,http://offenbach.wikia.com/
+ogame,http://ogame.wikia.com/
+olympians,http://percyjackson.wikia.com/
+onepiece,http://onepiece.wikia.com/
+onepiecefanon,http://onepiecefanon.wikia.com/
+opensource,http://opensource.wikia.com/
+othertitles,http://othertitles.wikia.com/
+otherverse,http://otherverse.wikia.com/
+ourbrant,http://ourbrant.wikia.com/
+oz,http://oz.wikia.com/
+pakistan,http://pakistan.wikia.com/
+papermario,http://papermario.wikia.com/
+paradisa,http://paradisa.wikia.com/
+paragon,http://cityofheroes.wikia.com/
+particracy,http://particracy.wikia.com/
+pathfinder,http://pathfinder.wikia.com/
+pffanon,http://pffanon.wikia.com/
+phineasandferb,http://phineasandferb.wikia.com/
+pikmin,http://pikmin.wikia.com/
+pinball,http://pinball.wikia.com/
+pirates,http://pirates.wikia.com/
+piratesonline,http://piratesonline.wikia.com/
+pixar,http://pixar.wikia.com/
+plbionicle,http://pl.bionicle.wikia.com/
+plcustombionicles,http://pl.custombionicles.wikia.com/
+plfallout,http://pl.fallout.wikia.com/
+plgothic,http://pl.gothic.wikia.com/
+plharrypotter,http://pl.harrypotter.wikia.com/
+pllostpedia,http://pl.lostpedia.wikia.com/
+plogame,http://pl.ogame.wikia.com/
+plstarwars,http://pl.starwars.wikia.com/
+plwikia,http://pl.wikia.com/
+plwow,http://pl.wow.wikia.com/
+plwykopedia,http://pl.wykopedia.wikia.com/
+pokemon,http://pokemon.wikia.com/
+pokemonfanon,http://pokemonfanon.wikia.com/
+pokemononline339,http://pokemononline.wikia.com/
+poker,http://poker.wikia.com/
+potbs,http://potbs.wikia.com/
+pov,http://opinion.wikia.com/
+powerrangers,http://powerrangers.wikia.com/
+ppc,http://ppc.wikia.com/
+primeval,http://primeval.wikia.com/
+prisonbreak,http://prisonbreak.wikia.com/
+prototype,http://prototype.wikia.com/
+prowrestling,http://prowrestling.wikia.com/
+psychology,http://psychology.wikia.com/
+ptdofus,http://pt.dofuspedia.wikia.com/
+ptlostpedia,http://pt.lostpedia.wikia.com/
+ptmicronations,http://pt.micronations.wikia.com/
+ptsimpsons,http://pt.simpsons.wikia.com/
+ptstarwars,http://pt.starwars.wikia.com/
+pttibia,http://pt.tibia.wikia.com/
+punchout,http://punchout.wikia.com/
+puppet,http://puppet.wikia.com/
+pushingdaisies,http://pushing-daisies.wikia.com/
+pvx,http://pvx.wikia.com/
+rainbowsix,http://rainbowsix.wikia.com/
+rappelz,http://rappelz.wikia.com/
+ratchet,http://ratchet.wikia.com/
+reboot,http://reboot.wikia.com/
+reborn,http://reborn.wikia.com/
+reddwarf,http://reddwarf.wikia.com/
+redwall,http://redwall.wikia.com/
+religionwiki,http://religion.wikia.com/
+renaissancekingdoms,http://renaissancekingdoms.wikia.com/
+requests,http://www.wikia.com/
+residentevil,http://residentevil.wikia.com/
+resistancefallofman,http://resistance.wikia.com/
+robotchicken3,http://robotchicken.wikia.com/
+robotwars,http://robotwars.wikia.com/
+rocketboom,http://rocketboom.wikia.com/
+rom,http://rom.wikia.com/
+routes,http://routes.wikia.com/
+rpg,http://rpg.wikia.com/
+rpgmaker,http://rpgmaker.wikia.com/
+ruharrypotter,http://ru.harrypotter.wikia.com/
+ruhistory,http://ru.history.wikia.com/
+rulostpedia,http://ru.lostpedia.wikia.com/
+runescape,http://runescape.wikia.com/
+runescapeclans,http://runescapeclans.wikia.com/
+runescapefanfiction,http://runescapefanfiction.wikia.com/
+rurpg,http://ru.rpg.wikia.com/
+ruscience,http://ru.science.wikia.com/
+rustarwars,http://ru.starwars.wikia.com/
+ruvlab,http://ru.vlab.wikia.com/
+ruwikia,http://ru.wikia.com/
+rvb,http://rvb.wikia.com/
+sacredseasons,http://sacredseasons.wikia.com/
+sailormoon,http://sailormoon.wikia.com/
+saintsrow,http://saintsrow.wikia.com/
+sarahtestwiki,http://sarahtestwiki.wikia.com/
+sca21,http://sca21.wikia.com/
+scarteleu,http://scarteleu.wikia.com/
+schoolcomputing,http://schoolcomputing.wikia.com/
+schools,http://schools.wikia.com/
+schulen,http://schulen.wikia.com/
+scoobydoo,http://scoobydoo.wikia.com/
+scratchpad,http://scratchpad.wikia.com/
+scrubs,http://scrubs.wikia.com/
+scum,http://scum.wikia.com/
+sealonline,http://sealonline.wikia.com/
+search,http://searchwiki.wikia.com/
+seattle,http://seattle.wikia.com/
+secondlife,http://secondlife.wikia.com/
+secretsaturdays,http://secretsaturdays.wikia.com/
+sega,http://sega.wikia.com/
+sentinels,http://sentinels.wikia.com/
+shaiya,http://shaiya.wikia.com/
+shugochara,http://shugochara.wikia.com/
+silent,http://silenthill.wikia.com/
+simpsons,http://simpsons.wikia.com/
+sims,http://sims.wikia.com/
+sknecyklopedia,http://necyklopedia.wikia.com/
+skyrates,http://skyrates.wikia.com/
+smallville,http://smallville.wikia.com/
+smashtasm,http://smashtasm.wikia.com/
+snicket,http://snicket.wikia.com/
+snk,http://snk.wikia.com/
+solarcooking,http://solarcooking.wikia.com/
+sonic,http://sonic.wikia.com/
+sonicfanon,http://sonicfanon.wikia.com/
+sonnywithachance,http://sonnywithachance.wikia.com/
+sot,http://sot.wikia.com/
+soulcalibur,http://soulcalibur.wikia.com/
+souleater,http://souleater.wikia.com/
+southpark,http://southpark.wikia.com/
+spacequest,http://spacequest.wikia.com/
+spiderman,http://spiderman.wikia.com/
+splintercell,http://splintercell.wikia.com/
+spongebob,http://spongebob.wikia.com/
+spongefan,http://spongefan.wikia.com/
+spore,http://spore.wikia.com/
+spyro,http://spyro.wikia.com/
+ssb,http://super-smash-bros.wikia.com/
+stad,http://stad.wikia.com/
+stalker,http://stalker.wikia.com/
+stanford,http://stanford.wikia.com/
+starcraft,http://starcraft.wikia.com/
+starfox,http://starfox.wikia.com/
+stargate,http://stargate.wikia.com/
+stargatewars,http://stargatewars.wikia.com/
+starocean,http://starocean.wikia.com/
+startingstrength,http://startingstrength.wikia.com/
+startrek,http://memory-beta.wikia.com/
+starwarsexodus,http://starwars-exodus.wikia.com/
+starwarsfr,http://fr.starwars.wikia.com/
+starwarsmush,http://starwarsmush.wikia.com/
+stexpanded,http://stexpanded.wikia.com/
+streetfighter,http://streetfighter.wikia.com/
+students,http://students.wikia.com/
+suitelife,http://suitelife.wikia.com/
+supcom,http://supcom.wikia.com/
+superman,http://superman.wikia.com/
+supernatural,http://supernatural.wikia.com/
+svtibia,http://sv.tibia.wikia.com/
+sw1mush,http://sw1mush.wikia.com/
+swfanon,http://swfanon.wikia.com/
+swfans,http://swfans.wikia.com/
+swg,http://swg.wikia.com/
+swgames,http://swgames.wikia.com/
+swmerchandise,http://swmerchandise.wikia.com/
+swrp,http://swrp.wikia.com/
+swrpg,http://swrpg.wikia.com/
+tamagotchi,http://tamagotchi.wikia.com/
+tardis,http://tardis.wikia.com/
+tdicamps,http://tdicamps.wikia.com/
+teentitans,http://teentitans.wikia.com/
+terminator,http://terminator.wikia.com/
+tesfanon,http://tesfanon.wikia.com/
+thatguywiththeglasses,http://thatguywiththeglasses.wikia.com/
+thegungancouncil,http://thegungancouncil.wikia.com/
+theoffice,http://theoffice.wikia.com/
+theppn_backup,http://ppn.wikia.com/
+theshatareu,http://theshatareu.wikia.com/
+theworldcc,http://theworldcc.wikia.com/
+thoriumbrotherhood,http://thoriumbrotherhood.wikia.com/
+tibiawiki,http://tibia.wikia.com/
+timesplitters,http://timesplitters.wikia.com/
+tmnt,http://tmnt.wikia.com/
+toastmasters,http://toastmasters.wikia.com/
+tokipona,http://tokipona.wikia.com/
+tolololpedia,http://tolololpedia.wikia.com/
+toohuman,http://toohuman.wikia.com/
+tor5,http://tor5.wikia.com/
+totalannihilation,http://totalannihilation.wikia.com/
+totaldramaisland,http://totaldramaisland.wikia.com/
+totaldramaislandfanfiction,http://totaldramaislandfanfiction.wikia.com/
+touhou,http://touhou.wikia.com/
+transfanon,http://transfanon.wikia.com/
+transformers,http://transformers.wikia.com/
+transformers2005,http://transformers2005.wikia.com/
+traveller,http://traveller.wikia.com/
+trekkipedia,http://memory-alpha.org/es/
+trsg,http://trsg.wikia.com/
+trukz,http://trukz.wikia.com/
+tryenisehir,http://tr.yenisehir.wikia.com/
+tsdc,http://tsdc.wikia.com/
+ttte,http://ttte.wikia.com/
+turtledove,http://turtledove.wikia.com/
+twewy,http://twewy.wikia.com/
+twilightsaga,http://twilightsaga.wikia.com/
+u5lazarus,http://ultima.wikia.com/
+uktransport,http://uktransport.wikia.com/
+unanswers,http://unanswers.wikia.com/
+uncharted,http://uncharted.wikia.com/
+uncyclopedia_de,http://de.uncyclopedia.org/
+unhalo,http://unhalo.wikia.com/
+unmario,http://unmario.wikia.com/
+unrunescape,http://unrunescape.wikia.com/
+utau,http://utau.wikia.com/
+valenciclopedia,http://valenciclopedia.wikia.com/
+vampireknight,http://vampireknight.wikia.com/
+varamozhi,http://varamozhi.wikia.com/
+vereins,http://vereins.wikia.com/
+videogamelies,http://videogamelies.wikia.com/
+videojuego,http://videojuego.wikia.com/
+vikipedo,http://vikipedo.wikia.com/
+villains,http://villains.wikia.com/
+vim,http://vim.wikia.com/
+vintagepatterns,http://vintagepatterns.wikia.com/
+vocaloid,http://vocaloid.wikia.com/
+vongopedia,http://vongopedia.wikia.com/
+vsrecommendedgames,http://vsrecommendedgames.wikia.com/
+wakfu,http://wakfu.wikia.com/
+war,http://war.wikia.com/
+warhammer40k,http://warhammer40k.wikia.com/
+warhammeronline,http://warhammeronline.wikia.com/
+warriors,http://warriors.wikia.com/
+warriorsfanfic,http://warriorsfanfic.wikia.com/
+warszawa,http://warszawa.wikia.com/
+watchmen,http://watchmen.wikia.com/
+webkinz,http://webkinz.wikia.com/
+weirdal,http://weirdal.wikia.com/
+westwing,http://westwing.wikia.com/
+whitewolf,http://whitewolf.wikia.com/
+wiccana_aoc,http://wiccana-aoc.wikia.com/
+wiedzmin,http://wiedzmin.wikia.com/
+wii,http://wii.wikia.com/
+wikiality,http://wikiality.wikia.com/
+wikicities,http://community.wikia.com/
+wikimac,http://mac.wikia.com/
+wirtschaftpedia,http://wirtschaftpedia.wikia.com/
+witcher,http://witcher.wikia.com/
+wizard101,http://wizard101.wikia.com/
+wizardsofwaverlyplace,http://wizardsofwaverlyplace.wikia.com/
+wonderlandonline,http://wonderlandonline.wikia.com/
+woot,http://woot.wikia.com/
+wot,http://wot.wikia.com/
+wowrp,http://wowrp.wikia.com/
+wowwiki,http://www.wowwiki.com/
+wswiki,http://websitewiki.wikia.com/
+wyrmrest,http://wyrmrest.wikia.com/
+xboxanswers,http://xbox.answers.wikia.com/
+xfiles,http://x-files.wikia.com/
+xiaolinshowdown,http://xiaolinshowdown.wikia.com/
+xmen,http://x-men.wikia.com/
+yahoomediaplayer,http://yahoomediaplayer.wikia.com/
+ycm,http://ycm.wikia.com/
+yellowikis,http://yellowikis.wikia.com/
+youtube,http://youtube.wikia.com/
+yoyo,http://yoyowiki.org/
+yugioh,http://yugioh.wikia.com/
+zelda,http://zelda.wikia.com/
+zeldafanon,http://zeldafanon.wikia.com/
+zhhkhongkong,http://zh-hk.hongkong.wikia.com/
+zhscratchpad,http://zh.scratchpad.wikia.com/
+zhuncyclopedia,http://zh.uncyclopedia.wikia.com/
+zombie,http://zombie.wikia.com/
+zomg,http://zomg.wikia.com/
+zootycoon,http://zootycoon.wikia.com/
diff --git a/userroles_scraper_scripts/userroles_from_listusers.py b/userroles_scraper_scripts/userroles_from_listusers.py
new file mode 100755 (executable)
index 0000000..fac27f5
--- /dev/null
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import csv
+import json
+import os
+import re
+import time
+
+import requests
+
+from itertools import islice
+
+roles = ['bot', 'sysop', 'bureaucrat', 'staff', 'rollback',  # 'util',
+         'helper', 'vstf', 'checkuser-global', 'bot-global',
+         'council', 'authenticated', 'checkuser', 'chatmoderator',
+         'adminmentor', 'steward', 'oversight', 'founder', 'rollbacker', 'researcher']
+output_path = "userlist-2017/"
+class ListUserAPI:
+
+    def __init__(self, url_root, wikitype):
+        self.wikitype = wikitype
+        if self.wikitype == "wikia":
+            self._api_url = url_root + 'index.php?action=ajax&rs=ListusersAjax::axShowUsers'
+        else:  # wikitype == "wikipedia"
+            self._api_url = url_root + 'api.php'
+
+    def _fetch_http(self, url, params):
+        if self.wikitype == "wikia":
+            response = requests.get(url=url, params=params, headers={'Accept-encoding': 'gzip'})
+            return response.text
+        else:  # wikitype == "wikipedia"
+            response = requests.get(url=url, params=params)
+            return response
+
+    def call(self, params):
+        response = self._fetch_http(self._api_url, params)
+        if self.wikitype == "wikia":
+            return json.loads(response)
+        else:
+            return response.json()
+
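+# A minimal usage sketch (illustrative; the parameters mirror the query built
+# in get_administrators_for_wiki below):
+#   wiki = ListUserAPI("http://halo.wikia.com/", wikitype="wikia")
+#   rv = wiki.call({'groups': 'bot,sysop,bureaucrat,', 'edits': 0, 'limit': 500,
+#                   'offset': 0, 'numOrder': 1, 'order': 'username:asc'})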
+
+def write_user_csvfile(output_file, user_list):
+    csvfile = csv.writer(output_file, delimiter='\t',
+                         quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+
+    # construct and output the header
+    csvfile.writerow(['username', 'groups',
+                      'edits', 'last.logged', 'last.edited'])
+
+    for user in user_list:
+        csvfile.writerow(user)
+        
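+# Each user row passed to write_user_csvfile is a list in the header's column
+# order: username, groups, edits, last.logged, last.edited.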
+
+def get_administrators_for_wiki(wikiname, url_root, wikitype="wikia"):
+    increment_size = 500
+    offset = 0
+
+    if wikitype == "wikia":
+        query = {'groups': 'bot,sysop,bureaucrat,',
+                 'edits': 0,
+                 'limit': increment_size,
+                 'offset': offset,
+                 'numOrder': 1,
+                 'order': 'username:asc'}
+
+
+    else:  # wikitype == "wikipedia"
+        query = {'action': 'query',
+                 'list': 'allusers',
+                 'augroup': "|".join(roles),
+                 'auprop': 'groups',
+                 'aulimit': 500,
+                 'format': 'json'}
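+        # Illustrative request this builds (parameters as above):
+        #   api.php?action=query&list=allusers&augroup=bot|sysop|...&auprop=groups&aulimit=500&format=json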
+
+    ## FIND THE CORRECT URL (there may be redirects)
+
+    if wikitype == "wikia":
+        url_root = requests.get(url_root).url
+        re_str = r"^http://(community|www)\.wikia\.com/"
+        if re.match(re_str, url_root):
+            # deleted wikis redirect to community central
+            # (http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia)
+            print("ERROR: %s no longer exists" % wikiname)
+            return "deleted"
+    try:
+        wiki = ListUserAPI(url_root, wikitype=wikitype)
+        rv = wiki.call(query)
+    except requests.ConnectionError:
+        print("ERROR: cannot read the user list for: %s" % wikiname)
+        # the caller records the failure; appending to notauthorized here as
+        # well would double-count the wiki
+        return "notauthorized"
+
+    if wikitype == "wikia":
+        raw_userlist = rv['aaData']
+
+        while (rv['iTotalRecords'] + offset) < rv['iTotalDisplayRecords']:
+            # increment the offset and make a new query
+            offset = offset + increment_size
+            query['offset'] = offset
+            rv = wiki.call(query)
+            raw_userlist.extend(rv['aaData'])
+            print("Fetched another page: offset is now %s" % offset)
+
+        # strip the HTML markup out of the rows returned in the JSON
+        processed_userlist = []
+        for row in raw_userlist:
+            row[0] = re.sub(r'^.*?<a href=.*?>(.*?)<.*$', r'\1', row[0])
+            row[4] = re.sub(r'^.*oldid=(\d+)".*$', r'\1', row[4])
+            row[4] = re.sub(r'^\-$', r'', row[4])
+            processed_userlist.append(row)
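+        # e.g. row[0] '<a href="/wiki/User:Example">Example</a>' -> 'Example' and
+        # row[4] '...oldid=12345"...' -> '12345' (illustrative values)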
+
+        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
+            write_user_csvfile(output_file, processed_userlist)
+
+    else:
+        raw_userlist = rv['query']['allusers']
+        with open("{0}/{1}.tsv".format(output_path, wikiname), 'w') as output_file:
+            outlines = ['\t'.join(["username", "groups"])]
+            outlines.extend(['\t'.join([q['name'], ','.join(q['groups'])]) for q in raw_userlist])
+            # end each batch with a newline so successive batches do not run
+            # together on one line
+            output_file.write('\n'.join(outlines) + '\n')
+
+            while 'continue' in rv:
+                # copy the continuation tokens returned by the API into the next request
+                query['continue'] = rv['continue']['continue']
+                query['aufrom'] = rv['continue']['aufrom']
+                rv = wiki.call(query)
+                raw_userlist = rv['query']['allusers']
+                outlines = ['\t'.join([q['name'], ','.join(q['groups'])]) for q in raw_userlist]
+                output_file.write('\n'.join(outlines) + '\n')
+                output_file.flush()
+
+
+# read in the list of files already present so we can skip wikis we have
+# already downloaded
+files = [os.path.join(output_path, i) for i in os.listdir(output_path)]
+
+# read the list of wikis to process, skipping the header row
+# (for a full run, use "list_of_wikis.csv"; the file below is a reduced list)
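+# each input line looks like: "halo,http://halo.wikia.com/"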
+d = [(line.split(",")[0], line.split(",")[1])
+     for line in islice(open("../wikis.needing.userroles.csv"), 1, None)]
+
+deleted = []
+notauthorized = []
+for wiki, url in d:
+    wiki = wiki.strip()
+    url = url.strip()
+    print(url)
+    if os.path.join(output_path, wiki + ".tsv") in files:
+        print("SKIPPING: file \"%s\" already exists" % wiki)
+        continue
+
+    print("Processing wiki: %s" % wiki)
+    if "wikipedia.org" in url:
+        wikitype = "wikipedia"
+        url = url + '/w/'
+    else:
+        # assume anything else (including custom domains like memory-alpha.org)
+        # is a wikia wiki
+        wikitype = "wikia"
+
+    result = get_administrators_for_wiki(wiki, url, wikitype=wikitype)
+    if result == "deleted":
+        deleted.append(wiki)
+    elif result == "notauthorized":
+        notauthorized.append(wiki)
+    time.sleep(1)
+
+df = open("allusers_WP_error_deleted.txt",'w')
+df.write('\n'.join(deleted))
+df.close()
+
+na = open("allusers_WP_error_notauthorized.txt",'w')
+na.write('\n'.join(notauthorized))
+na.close()
diff --git a/userroles_scraper_scripts/userroles_from_logevents.py b/userroles_scraper_scripts/userroles_from_logevents.py
new file mode 100755 (executable)
index 0000000..8a8a3a5
--- /dev/null
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2018  Nathan TeBlunthuis
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import re
+import time
+
+import requests
+
+from json.decoder import JSONDecodeError
+from itertools import islice
+
+def write_logevents(logevents,out):
+    for logevent in logevents:
+        # if there is hidden information, we skip this one because there
+        # is nothing to report
+        if 'userhidden' in logevent or 'actionhidden' in logevent or 'commenthidden' in  logevent:
+            continue
+
+        le_output = [logevent['comment'],
+                     str(logevent['logid']),
+                     str(logevent['ns']),
+                     str(logevent['pageid']),
+                     logevent['timestamp'],
+                     logevent['title'],
+                     logevent['type'],
+                     str(logevent['user'])]
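+        # these columns match the header written by get_events_for_wiki:
+        # comment, logid, ns, pageid, timestamp, title, type, user (the
+        # ancient/rights-new/rights-old columns are appended below)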
+
+        if "params" in logevent:
+            params = logevent["params"]
+        else:
+            params = {}
+
+        if "rights" in logevent:
+            le_output.extend(['false',
+                              logevent['rights']['new'],
+                              logevent['rights']['old']])
+        elif "newgroups" in params and "oldgroups" in params:
+            le_output.extend(['false',
+                              ','.join(params['newgroups']),
+                              ','.join(params['oldgroups'])])
+        else:
+            # no old/new rights information: mark the event as "ancient" format
+            le_output.extend(['true', '', ''])
+
+        out.write("\t".join(le_output) + "\n")
+    out.flush()
+
+def get_events_for_wiki(wikiname, url, wikitype="wikia"):
+    if url[-1] != '/':
+        url = url + '/'
+
+    #out = open("../wikipedias/adminlist_output/logevents/nobackup/%s.tsv" % wikiname, "w")
+    out = open("logevents-2017/%s.tsv" % wikiname, "w")
+    out.write("\t".join(['comment', 'logid', 'ns', 'pageid', 'timestamp', 'title',
+                         'type', 'user', 'ancient', 'rights-new', 'rights-old']) + "\n")
+
+    if wikitype == "wikia":
+        api_url = url + 'api.php'
+    else: #wikitype == wikipedia
+        api_url = url + "w/api.php"
+        
+    query = {'action': 'query',
+             'list': 'logevents',
+             'letype' : 'rights',
+             'lelimit' : '500',
+             'format':'json',
+             'ledir':'newer'}
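+    # Illustrative request this builds (parameters as above):
+    #   api.php?action=query&list=logevents&letype=rights&lelimit=500&ledir=newer&format=json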
+
+    response = requests.get(api_url, params=query)
+    hit_url = response.url
+
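+    # Three outcomes for a wikia wiki, judged from the post-redirect URL:
+    # (1) redirected to community central -> the wiki was deleted;
+    # (2) an api.php endpoint -> parse the JSON response;
+    # (3) redirected elsewhere on wikia.com -> retry against the new base URL.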
+    if wikitype == "wikia":
+        re_str = r"^http://(community|www)\.wikia\.com/"
+        if re.match(re_str, hit_url):
+            # deleted wikis redirect to community central
+            # (http://community.wikia.com/wiki/Community_Central:Not_a_valid_Wikia)
+            print("ERROR: %s no longer exists" % wikiname)
+            return
+        else:
+            re_str = r"^(http|https)://.*\.wikia\.com/api\.php"
+            if re.match(re_str, hit_url):
+                try:
+                    rv = response.json()
+                except JSONDecodeError:
+                    print("ERROR: could not decode JSON from %s" % hit_url)
+                    return
+            else:
+                re_str = r"^((http|https)://.*\.wikia\.com)"
+                new_url = re.findall(re_str, hit_url)[0][0]
+                return get_events_for_wiki(wikiname, new_url, wikitype=wikitype)
+    else:  # wikitype == "wikipedia": no redirect handling needed
+        rv = response.json()
+
+    try:
+        logevents = rv['query']['logevents']
+        write_logevents(logevents, out)
+    except KeyError as e:
+        print("ERROR: %s contains no logevent data" % wikiname)
+        print(e)
+        return
+    
+    while 'query-continue' in rv or 'continue' in rv:
+        if 'query-continue' in rv:  # older MediaWiki continuation style
+            query['lestart'] = rv['query-continue']['logevents']['lestart']
+        else:
+            # copy the continuation tokens returned by the API into the next request
+            query['continue'] = rv['continue']['continue']
+            query['lecontinue'] = rv['continue']['lecontinue']
+
+        response = requests.get(api_url, params=query)
+        rv = response.json()
+        logevents = rv['query']['logevents']
+        write_logevents(logevents, out)
+
+    out.close()
+
+files = [re.sub(r'\.tsv$', '', i) for i in os.listdir("logevents-2017")]
+
+# iterate through the list of wikis
+# for line in ["anime,http://anime.wikia.com/"]:
+# for line in ["blogging,http://blogging.wikia.com/"]:
+header = True
+i = 1 if header else 0  # skip the header row of the wiki list if present
+
+# for line in open("list_of_wikis.csv", "r").readlines():
+for line in islice(open("../wikis.needing.userroles.csv", "r"),i,None):
+
+    (wiki, url) = line.split(",")
+    url = url.strip()
+    print("Processing wiki: %s" % wiki)
+
+    if wiki in files:
+        print("SKIPPING: file \"%s\" already exists" % wiki)
+        continue
+
+    if "wikia.com" in url:
+        wikitype = "wikia"
+    else:  # assume wikipedia.org
+        wikitype = "wikipedia"
+
+
+    get_events_for_wiki(wiki, url, wikitype=wikitype)
+    time.sleep(1)
