#! /bin/bash
#
# Annotates all record files stored in HDFS using the Hadoop Streaming MAP API
# 
#
# Written by Ikaro Silva
# Last modified August 27, 2014
#
#
#This script requires that the PhysioNet files are already on HDFS.
#Make sure that the HDFS cluster has been configured by running
#
#./prepare-dataset.sh
#For more tuning information see : http://hadoop.apache.org/docs/r0.18.3/streaming.pdf

if [ "$1" == "-h" ]; then
  echo -e "\n\tUsage: `basename $0` recordListFile annotationCommand [annotationArgs]\n"
  echo -e  "\nRuns the WFDB command 'annotationCommand' in all records in recordListFile"
  echo -e "generating annotation files in Hadoop's HDFS.\n"
  echo -e "\trecordList \t-A '*.ind' in HDFS containing all the records to be run in batch mode."
  echo -e "\t\t\tIf the recordListFile is a '*.ind' (similar to the ones generated by 'prepare-datset.sh' script,"
  echo -e "\t\t\tthen the nodes will download the records and process them through the local file system."
  exit 0
fi

#Load configuration variables - This includes the database to be processed.
# NOTE(review): DATA_DIR and HDFS_ROOT are used further down but are never
# assigned in this file; presumably this configuration script defines them -
# confirm.
source wfdb-hadoop-configuration.sh


#The record list file to be processed in batch mode (first positional argument)
FILE=${1:-}
# Stop early when no record list was given instead of launching a job with an
# empty -input value.
if [[ -z "${FILE}" ]]; then
  echo "Exiting: missing recordListFile argument (run '$(basename -- "$0") -h' for usage)" >&2
  exit 1
fi

#Check if the command is properly installed
# 'wqrs -h' is used as the probe: if it cannot be run successfully the WFDB
# tools are considered unavailable on this machine.
if ! wqrs -h > /dev/null 2>&1; then
   # ANN is not assigned anywhere in this file (it may come from the sourced
   # configuration); fall back to the name of the command that was probed so
   # the message is never blank.
   echo "Exiting: Annotation command ${ANN:-wqrs} cannot be executed! " >&2
   # Explicit non-zero status: a bare 'exit' would return the status of the
   # preceding echo (0) and report success to the caller.
   exit 1
fi

# Print the last component of the directory part of the path given in $1,
# e.g. /physionet/mitdb/records.ind -> mitdb. A path without any '/' is
# printed unchanged.
wfdb_db_name() {
  basename -- "${1%/*}"
}

# The database name is the directory that holds the record list file.
DB=$(wfdb_db_name "${FILE}")

# Log the WFDB search path to stderr, then export it. The second message shows
# the value that is actually exported ('.' first, then the database directory).
echo "Setting WFDB enviroment: export WFDB=${DATA_DIR}/${DB}/ " >&2
echo "export WFDB=\".:${DATA_DIR}/${DB}/\"" >&2
export WFDB=".:${DATA_DIR}/${DB}/"

# Put the native libraries and binaries under DATA_DIR/mcode in front of the
# existing search paths. These two log lines go to stdout, as before.
echo "export LD_LIBRARY_PATH=${DATA_DIR}/mcode/nativelibs/linux-amd64/lib64/:\$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH="${DATA_DIR}/mcode/nativelibs/linux-amd64/lib64/:${LD_LIBRARY_PATH}"
echo "export PATH=${DATA_DIR}/mcode/nativelibs/linux-amd64/bin/:\$PATH"
export PATH="${DATA_DIR}/mcode/nativelibs/linux-amd64/bin/:${PATH}"

#Call Hadoop Streaming
# The arguments are collected in an array so that every value is passed as
# exactly one word, even if it contains spaces.
hadoop_args=(
  # Generic options (must come before the streaming options).
  -D mapreduce.task.timeout=1000000
  # All child JVM flags go into ONE property value: repeating
  # '-D mapred.child.java.opts=...' makes each later value replace the
  # earlier one, so only the last flag would reach the task JVMs.
  -D "mapred.child.java.opts=-Xmx2g -Xms2g -XX:-UseConcMarkSweepGC"

  # Job input (the record list) and output location.
  -input "${FILE}"
  -output output
  -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat

  # Mapper script plus the helper files shipped along with the job.
  -mapper mse-map.sh
  -file mse-map.sh
  -file surrogate-test.sh
  -file least_sqfit.m
  -file shuffle.m

  # Environment handed to the mapper processes.
  -cmdenv "HDFS_ROOT=${HDFS_ROOT}"
  -cmdenv "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
  -cmdenv "PATH=${PATH}"
  -cmdenv "DATA_DIR=${DATA_DIR}"
  -cmdenv "WFDB=${WFDB}"
)

# The jar glob is intentionally left unquoted so the shell expands it. This is
# the last command, so its exit status becomes the script's exit status.
hadoop jar /usr/lib/hadoop-0.20/contrib/streaming/hadoop-streaming-*.jar "${hadoop_args[@]}"