#!/bin/bash

# This script removes system and file information from the traces and only maintains consistent hashes.
# This has the following signature.
#
# usage: dftracer_anonymize [-fcv] [-d input_directory] [-o output_directory]
#   -f                      override output directory
#   -c                      compress outputs
#   -v                      enable verbose mode
#   -h                      display help
#   -d input_directory      specify input directories. should contain .pfw or .pfw.gz files.
#   -o output_directory     specify output directory.

# Default settings; the option parser further down may override each of them.
LOG_DIR="${PWD}"           # input directory (-d)
OUTPUT_DIR="${PWD}/output" # output directory (-o)
compressed=0               # 1 once -c is given
override=0                 # 1 once -f is given

# Print the option summary to stdout and terminate the script with status 1.
function usage {
  # Unquoted delimiter: $(basename "$0") is expanded, everything else is literal.
  cat <<EOF
usage: $(basename "$0") [-fcv] [-d input_directory] [-o output_directory]
  -f                      override output directory
  -c                      compress output file
  -v                      enable verbose mode
  -h                      display help
  -d input_directory      specify input directories. should contain .pfw or .pfw.gz files.
  -o output_directory     specify output directory.
EOF
  exit 1
}

# Write "$1" to stdout, prefixed with the current local time formatted as
# "YYYY-MM-DD HH:MM:SS" and the separator " - ".
function log {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$stamp" "$1"
}

# Parse the command-line options. The leading ':' in the option string selects
# getopts' silent mode, in which a missing option argument is reported as ':'
# and an unknown option as '?'.
while getopts ':cvfd:o:h' opt; do
  case "$opt" in
  c) compressed=1 ;;
  d) LOG_DIR="$OPTARG" ;;
  f) override=1 ;;
  h) usage ;;
  o) OUTPUT_DIR="$OPTARG" ;;
  v) set -x ;; # trace every command executed from here on
  :)
    echo -e "option requires an argument.\n"
    usage
    ;;
  \?)
    echo -e "Invalid command option.\n"
    usage
    ;;
  esac
done

# Resolve both directories to absolute paths. The output path is removed
# recursively below, so it has to be unambiguous and must never be empty.
input_arg=$LOG_DIR
output_arg=$OUTPUT_DIR
LOG_DIR=$(realpath "$input_arg")
OUTPUT_DIR=$(realpath "$output_arg")

# Drop the options that getopts has consumed.
shift "$((OPTIND - 1))"

if [ -z "$LOG_DIR" ] || [ ! -d "$LOG_DIR" ]; then
  log "The input directory does not exist: $input_arg"
  exit 1
fi

# An empty OUTPUT_DIR (realpath failed) would make every later
# "$OUTPUT_DIR/..." path point into the filesystem root.
if [ -z "$OUTPUT_DIR" ] || [ "$OUTPUT_DIR" = "/" ]; then
  log "The output directory is not usable: $output_arg"
  exit 1
fi

# The output directory is wiped below; refuse to continue when that would
# delete the input traces (same directory, or input nested inside output).
case "$LOG_DIR/" in
"$OUTPUT_DIR"/*)
  log "The input directory must not be the output directory or lie inside it: $LOG_DIR"
  exit 1
  ;;
esac

# Collect the traces before touching the output directory. Only regular files
# ending in .pfw or .pfw.gz are kept; this also discards the literal pattern
# that the glob leaves behind when nothing matches.
files=()
for candidate in "$LOG_DIR"/*.pfw*; do
  case "$candidate" in
  *.pfw | *.pfw.gz)
    if [ -f "$candidate" ]; then
      files+=("$candidate")
    fi
    ;;
  esac
done
total=${#files[@]}

if [ "$total" -eq 0 ]; then
  log "The folder does not contain any pfw or pfw.gz files."
  exit 0
fi

log "Creating output directory: ${OUTPUT_DIR}"
if ! mkdir -p "${OUTPUT_DIR}"; then
  log "Could not create output directory: ${OUTPUT_DIR}"
  exit 1
fi

# NOTE(review): this refusal exits with status 0, so callers cannot tell it
# apart from a successful run.
if [ -n "$(ls -A "${OUTPUT_DIR}")" ] && [ "$override" -eq 0 ]; then
  log "The directory is not empty. Please pass a clean directory or pass -f flag."
  exit 0
fi

log "Setting up output directory"
# ":?" aborts instead of expanding to an empty path.
rm -rf "${OUTPUT_DIR:?}"
if ! mkdir -p "${OUTPUT_DIR}"; then
  log "Could not create output directory: ${OUTPUT_DIR}"
  exit 1
fi

log "Found $total files to process."
# Upper bound on the number of files anonymized concurrently.
JOBS_LIMIT=64

# Print $1 with every character that is special in a sed basic regular
# expression (and the "/" delimiter) backslash-escaped, so that the result can
# be embedded in "s/.../.../" and matches $1 literally.
function escape_sed_pattern {
  local raw=$1 escaped="" ch i
  for ((i = 0; i < ${#raw}; i++)); do
    ch=${raw:i:1}
    case "$ch" in
    '\' | '/' | '.' | '*' | '[' | '^' | '$') escaped+="\\${ch}" ;;
    *) escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

# Copy stdin to stdout, replacing every literal occurrence of the user name
# given as $1 with the fixed token "USER". Returns 1 without reading anything
# when the name is empty, since "s//USER/g" has no pattern to match.
function anonymize_stream {
  if [ -z "$1" ]; then
    echo "anonymize_stream: user name is empty" >&2
    return 1
  fi
  sed -e "s/$(escape_sed_pattern "$1")/USER/g"
}

# loop over logs
for file_index in "${!files[@]}"; do
  file=${files[$file_index]}

  # Throttle: poll once a second until fewer than JOBS_LIMIT jobs are running.
  running_jobs=$(jobs -rp | wc -l)
  if [ "$running_jobs" -ge "$JOBS_LIMIT" ]; then
    log "Waiting for running jobs to be less than $JOBS_LIMIT. Current: $running_jobs"
    while [ "$running_jobs" -ge "$JOBS_LIMIT" ]; do
      sleep 1
      running_jobs=$(jobs -rp | wc -l)
    done
    log "Running jobs are now less than $JOBS_LIMIT. Current: $running_jobs"
  fi

  {
    # only look at files
    if [ -f "$file" ]; then
      # This group runs in a background subshell, so pipefail stays local to
      # it; it makes a failing gunzip visible in the pipeline status below.
      set -o pipefail
      filename=$(basename -- "$file")
      ext="${filename##*.}"
      log "Processing file: $filename with extension: $ext"

      # Outputs are named after the position of the input in the file list.
      # NOTE(review): presumably so that the original file names do not leak
      # into the anonymized output — confirm.
      name=$file_index
      log "Extracted name: $name"
      output="$OUTPUT_DIR/$name.pfw"

      # Stream straight from the input file; no intermediate copy is needed.
      if [ "$ext" == "gz" ]; then
        gunzip -c "$file" | anonymize_stream "$USER" >"$output"
      else
        anonymize_stream "$USER" <"$file" >"$output"
      fi
      status=$? # status of whichever branch ran above

      if [ "$status" -ne 0 ]; then
        log "Failed to process: $file"
        # Do not leave a truncated trace behind.
        rm -f "$output"
      elif [ "$compressed" == 1 ]; then
        gzip "$output"
        log "Processed and compressed: $output.gz"
      else
        log "Processed: $output"
      fi
    fi
  } &
done

# Barrier: block until every background job has finished.
wait

log "Finished anonymization of traces."
