]> code.communitydata.science - cdsc_reddit.git/blobdiff - ngrams/checkpoint_parallelsql.sbatch
Refactor and reorganize.
[cdsc_reddit.git] / ngrams / checkpoint_parallelsql.sbatch
diff --git a/ngrams/checkpoint_parallelsql.sbatch b/ngrams/checkpoint_parallelsql.sbatch
new file mode 100644 (file)
index 0000000..dd61e65
--- /dev/null
@@ -0,0 +1,26 @@
+#!/bin/bash
+## checkpoint_parallelsql.sbatch
+#SBATCH --job-name=tf_subreddit_comments
+## Allocation Definition
+#SBATCH --account=comdata-ckpt
+#SBATCH --partition=ckpt
+## Resources
+## Nodes. This should always be 1 for parallel-sql.
+#SBATCH --nodes=1    
+## Walltime (12 hours)
+#SBATCH --time=12:00:00
+## Memory per node
+#SBATCH --mem=32G
+#SBATCH --cpus-per-task=4
+#SBATCH --ntasks=1
+#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
+source ./bin/activate
+module load parallel_sql
+echo $(which perl)
+conda list pyarrow
+which python3
+#Put commands to load other modules (e.g. matlab) here.
+#The command below makes parallel_sql fetch tasks from the database
+#and run them on the node in parallel. With --jobs 4 (matching
+#--cpus-per-task=4 above), at most 4 tasks are expected to run at once.
+parallel-sql --sql -a parallel --exit-on-term --jobs 4

Community Data Science Collective || Want to submit a patch?