#!/bin/bash
## checkpoint_parallelsql.sbatch
## Slurm batch script that starts a parallel-sql worker on a single node.
## The worker pulls queued tasks from the parallel-sql database and runs
## them concurrently on the CPUs allocated to this job.
#SBATCH --job-name=tf_subreddit_comments
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=12:00:00
## Memory per node
#SBATCH --mem=32G
#SBATCH --cpus-per-task=4
#SBATCH --ntasks=1
## Working directory for the job; the relative path below resolves against it.
#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit

# Activate the environment that lives in the working directory. Abort if it
# is unavailable: otherwise the worker would still start and consume queued
# tasks with whatever interpreter happens to be on PATH.
if ! source ./bin/activate; then
  echo "error: failed to source ./bin/activate in ${PWD}" >&2
  exit 1
fi

# parallel-sql is provided by an environment module; without it the final
# command cannot run, so fail early with a clear message.
if ! module load parallel_sql; then
  echo "error: failed to load module parallel_sql" >&2
  exit 1
fi

# Diagnostics written to the job log: which perl/python3 are on PATH and
# which pyarrow build conda reports. These are informational only, so a
# failure here does not stop the job.
command -v perl
conda list pyarrow
command -v python3

# Put here commands to load other modules (e.g. matlab etc.)

# parallel-sql fetches tasks from the database and runs them on this node in
# parallel. The number of simultaneous tasks follows the CPUs requested via
# --cpus-per-task (Slurm exports it as SLURM_CPUS_PER_TASK), falling back to
# 4 when the variable is not set.
# NOTE(review): --exit-on-term presumably makes the worker exit when the job
# is terminated — confirm against the parallel-sql docs.
parallel-sql --sql -a parallel --exit-on-term --jobs "${SLURM_CPUS_PER_TASK:-4}"