code.communitydata.science - mediawiki_dump_tools.git/blobdiff - wikiq_users/run_wikiq_users_cluster.sh
add more variables and support for persistence
[mediawiki_dump_tools.git] / wikiq_users / run_wikiq_users_cluster.sh
index beca0f9a8770aa8baa8b55a519232a8f71c7d7cb..84e23f05152b6fef478d3b4c6acf72cdec508007 100755 (executable)
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-spark-submit --master  spark://n0649:18899 wikiq_users_spark.py --output-format parquet  -i "/com/output/wikiq-enwiki-20180301/enwiki-20180301-pages-meta-history*.tsv" -o  "/com/output/wikiq-users-enwiki-20180301-parquet/" --num-partitions 500
+spark-submit --master  spark://n0649:18899 wikiq_users_spark.py --output-format parquet  -i "/com/output/wikiq-enwiki-persist-sequence-20180301/enwiki/enwiki-20180301-pages-meta-history*.tsv" -o  "/com/output/wikiq-users-enwiki-20180301-parquet/" --num-partitions 500 --schema-opt persistence+collapse

Community Data Science Collective || Want to submit a patch?