datasets/checkpoint_parallelsql.sbatch

   1 #!/bin/bash
   2 ## parallel_sql_job.sh
   3 #SBATCH --job-name=tf_subreddit_comments
   4 ## Allocation Definition
   5 #SBATCH --account=comdata-ckpt
   6 #SBATCH --partition=ckpt
   7 ## Resources
   8 ## Nodes. This should always be 1 for parallel-sql.
   9 #SBATCH --nodes=1
  10 ## Walltime (12 hours)
  11 #SBATCH --time=12:00:00
  12 ## Memory per node
  13 #SBATCH --mem=32G
  14 #SBATCH --cpus-per-task=4
  15 #SBATCH --ntasks=1
  16 #SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
  17 source ./bin/activate
  18 module load parallel_sql
  19 echo $(which perl)
  20 conda list pyarrow
  21 which python3
  22 #Put here commands to load other modules (e.g. matlab etc.)
  23 #Below command means that parallel_sql will get tasks from the database
  24 #and run them on the node (in parallel). So a 16 core node will have
  25 #16 tasks running at one time.
  26 parallel-sql --sql -a parallel --exit-on-term --jobs 4