#!/usr/bin/env Rscript

## Script used to choose the top 1% of wikis to analyze

# Copyright (C) 2018 Nathan TeBlunthuis

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

library("ggplot2")
library("data.table")

# Directory of per-wiki count files; each file name is the wiki's database
# name followed by ".editors".
counts.dir <- "../wikiq_wikia_2010_unique_editors/"
files <- list.files(counts.dir)

if (length(files) == 0L) {
  stop("No count files found in ", counts.dir, call. = FALSE)
}

# Read one headerless count file from `counts.dir`.
#   f: file name relative to `counts.dir`.
# Returns the data.frame produced by read.csv().
read.count.file <- function(f) {
  read.csv(paste0(counts.dir, f), header = FALSE)
}

dbname <- gsub("\\.editors", '', files)

# Exactly one value is expected per file. vapply() enforces that, so a
# malformed file fails loudly instead of silently shifting counts out of
# alignment with `dbname`.
counts <- vapply(
  files,
  function(f) as.numeric(unlist(read.count.file(f), use.names = FALSE)),
  numeric(1),
  USE.NAMES = FALSE
)

dt <- data.table(wiki = dbname, n.editors = counts)

#ggplot(dt,aes(x=n.editors)) + stat_ecdf(geom="step") + scale_x_log10(minor_breaks=10**(1:10/2)) + scale_y_continuous(minor_breaks=1:20/20)

# 99th percentile of the editor counts: wikis at or above it form the top 1%.
top_1_percentile <- quantile(x = dt$n.editors, probs = 0.99, names = FALSE)

## NOTE: per the original author, a round cutoff of 100 editors is very close
## to the top 1%; the selection below uses the computed percentile itself.
wiki.list <- dt[n.editors >= top_1_percentile]

# `dt` has no `url` column, so the previous `is.na(url)` filter resolved to
# the base function url() rather than to data and never populated the column.
# Build the URL for every selected wiki directly.
wiki.list[, url := paste0("http://", wiki, ".wikia.com/")]
wiki.list[, wiki.type := "wikia"]

fwrite(wiki.list, "selected.wikis.csv")