#!/bin/sh

# This script does some very basic benchmarks with 'qsv' on a city population
# data set (which is a strict subset of the `worldcitiespop` data set). If it
# doesn't exist on your system, it will be downloaded to /tmp for you.
#
# These aren't meant to be overly rigorous, but they should be enough to catch
# significant regressions.
#
# Make sure you're using a `qsv` binary generated by `cargo build --release`.

set -e

# Optional first argument: an extended regular expression. When given, only
# benchmarks whose name matches it in full are executed.
pat="$1"

# Input files and the index file that `qsv index` creates next to the data.
data=/tmp/worldcitiespop_mil.csv
countrydata=/tmp/countrynames.csv
data_idx=/tmp/worldcitiespop_mil.csv.idx

# Fetch missing inputs. Each download goes to a scratch name and is renamed
# only after curl succeeds (-f turns HTTP errors into a non-zero exit), so an
# interrupted or failed download is never mistaken for valid data on a later
# run.
if [ ! -r "$data" ]; then
  curl -sSf -o "$data.part" https://burntsushi.net/stuff/worldcitiespop_mil.csv
  mv "$data.part" "$data"
fi

# Size of the data set in bytes, used to derive MB/sec. `wc -c` is used
# instead of `stat --format`, which only GNU stat understands; `tr` strips
# the padding some `wc` implementations emit.
data_size=$(wc -c < "$data" | tr -d ' ')

if [ ! -r "$countrydata" ]; then
  curl -sSf -o "$countrydata.part" https://gist.githubusercontent.com/anonymous/063cb470e56e64e98cf1/raw/98e2589b801f6ca3ff900b01a87fbb7452eb35c7/countrynames.csv
  mv "$countrydata.part" "$countrydata"
fi

# Time a single run of a command and print its wall-clock duration.
#
# Arguments: the command and its arguments. They are joined with spaces into
#            one string and re-parsed by `sh -c`, so an argument that has to
#            reach the command as a single word containing shell
#            metacharacters must carry its own embedded quotes.
# Outputs:   the `real` value reported by `time -p`, raised to 0.01 when the
#            measurement is smaller than that.
# Returns:   non-zero, with a message on stderr, if no timing was captured.
real_seconds() {
  # "$*" joins the arguments without the pathname expansion that an unquoted
  # $@ would apply to arguments such as '(?i)us'.
  cmd="$* > /dev/null 2>&1"
  # The command's own output is discarded inside `sh -c`; what remains on
  # stderr is the report written by the external `time`.
  t=$(
    $(which time) -p sh -c "$cmd" 2>&1 \
      | awk '/^real/ { print $2 }')
  if [ -z "$t" ]; then
    echo "real_seconds: no timing captured for: $*" >&2
    return 1
  fi
  if [ "$(echo "$t < 0.01" | bc)" = 1 ]; then
    t=0.01
  fi
  echo "$t"
}

# Average the wall-clock time of three runs of a command, each started with
# no index file present.
#
# Globals:   data_idx (read) - path of the index file removed before each run
# Arguments: the command and its arguments, passed through to real_seconds
# Outputs:   the mean of the three timings, computed by bc with scale=2
# Returns:   non-zero if a timing could not be obtained
benchmark() {
  total=0
  for _run in 1 2 3; do
    # Remove any index left behind by a previous run so it cannot be used.
    rm -f "$data_idx"
    elapsed=$(real_seconds "$@") || return 1
    total="$total + $elapsed"
  done
  echo "scale=2; ($total) / 3" | bc
}

# Average the wall-clock time of three runs of a command with an index file
# in place. The index is built once up front and removed afterwards.
#
# Globals:   data (read)     - file handed to `qsv index`
#            data_idx (read) - index file removed before and after the runs
# Arguments: the command and its arguments, passed through to real_seconds
# Outputs:   the mean of the three timings, computed by bc with scale=2
# Returns:   non-zero if the index could not be built or a timing is missing
benchmark_with_index() {
  rm -f "$data_idx"
  # Callers capture this function's stdout as the result, so anything the
  # indexing step prints is diverted to stderr.
  qsv index "$data" >&2 || return 1
  total=0
  for _run in 1 2 3; do
    elapsed=$(real_seconds "$@") || { rm -f "$data_idx"; return 1; }
    total="$total + $elapsed"
  done
  rm -f "$data_idx"
  echo "scale=2; ($total) / 3" | bc
}

# Run one named benchmark and print a result line.
#
# Usage:     run [--index] NAME COMMAND [ARG...]
# Globals:   pat (read)       - if non-empty, NAME must match it in full
#                               (extended regex) or the benchmark is skipped
#            data_size (read) - byte count used for the MB/sec figure
# Arguments: --index  time the command via benchmark_with_index instead of
#                     benchmark
#            NAME     label printed in the first column
#            COMMAND  command line to time
# Outputs:   "NAME<TAB>N.NN seconds<TAB>M MB/sec" on stdout
# Returns:   0 when the benchmark ran or was filtered out, non-zero when no
#            timing could be obtained
run() {
  index=
  while true; do
    case "$1" in
      --index)
        index="yes"
        shift
        ;;
      *) break ;;
    esac
  done
  name="$1"
  shift

  # Skip benchmarks the user's pattern does not select.
  if [ -n "$pat" ] && ! printf '%s\n' "$name" | grep -E -q "^${pat}\$"; then
    return 0
  fi

  if [ -z "$index" ]; then
    t=$(benchmark "$@") || return 1
  else
    t=$(benchmark_with_index "$@") || return 1
  fi
  if [ -z "$t" ]; then
    echo "run: no timing obtained for $name" >&2
    return 1
  fi
  mb_per=$(echo "scale=2; ($data_size / $t) / 2^20" | bc)
  # Quoted so that each value fills exactly one format field.
  printf "%s\t%0.02f seconds\t%s MB/sec\n" "$name" "$t" "$mb_per"
}

# Benchmark list. Each entry has the form:
#   run [--index] <benchmark name> <qsv command line>

# count / fill / fixlengths / flatten / fmt
run count qsv count "${data}"
run --index count_index qsv count "${data}"
run fill qsv fill -v 'Unknown' 'Population' "${data}"
run fixlengths qsv fixlengths "${data}"
run flatten qsv flatten "${data}"
run flatten_condensed qsv flatten "${data}" --condense 50
run fmt qsv fmt --crlf "${data}"

# frequency / index / join / partition / rename / reverse
run frequency qsv frequency "${data}"
run --index frequency_index qsv frequency "${data}"
run frequency_selregex qsv frequency -s '/^R/' "${data}"
run index qsv index "${data}"
run join qsv join --no-case 'Country' "${data}" 'Abbrev' "${countrydata}"
run partition qsv partition 'Region' '/tmp/partitioned' "${data}"
run rename qsv rename "country,city,accent_city,region,population,lat,long" "${data}"
run reverse qsv reverse "${data}"

# sample
run sample_10 qsv sample 10 "${data}"
run --index sample_10_index qsv sample 10 "${data}"
run sample_1000 qsv sample 1000 "${data}"
run --index sample_1000_index qsv sample 1000 "${data}"
run sample_100000 qsv sample 100000 "${data}"
run --index sample_100000_index qsv sample 100000 "${data}"
run --index sample_25pct_index qsv sample 0.25 "${data}"

# search / select / sort / slice
# The inner single quotes are part of the argument: the command line is
# re-parsed by `sh -c` before it is executed.
run search qsv search -s 'Country' "'(?i)us'" "${data}"
run select qsv select 'Country' "${data}"
run select_regex qsv select '/^L/' "${data}"
run sort qsv sort -s 'AccentCity' "${data}"
run slice_one_middle qsv slice -i 500000 "${data}"
run --index slice_one_middle_index qsv slice -i 500000 "${data}"

# stats / table / transpose
run stats qsv stats "${data}"
run --index stats_index qsv stats "${data}"
run stats_everything qsv stats "${data}" --everything
run --index stats_everything_index qsv stats "${data}" --everything
run table qsv table "${data}"
run transpose qsv transpose "${data}"
