############### Run Gobblin Jobs on Hadoop MR ################
##############################################################
#
# gobblin-mapreduce.sh: command-line handling and environment validation.
#
# Globals written:
#   FWDIR, FWDIR_LIB, FWDIR_CONF   install root and its lib/ and conf/ dirs
#   JOB_TRACKER_URL, FS_URL, WORK_DIR, JARS, JOB_CONFIG_FILE
#                                  values of --jt, --fs, --workdir, --jars, --conf
#   GOBBLIN_WORK_DIR               exported; --workdir overrides the environment
# Exit status:
#   0 after --help; 1 (usage printed) when --conf or the work directory is
#   missing, or when a value-taking option is the last argument.

FWDIR="$(cd -- "$(dirname -- "$0")/.." && pwd)"
FWDIR_LIB="$FWDIR/lib"
FWDIR_CONF="$FWDIR/conf"

# Print the command-line synopsis on stdout.
print_usage() {
  echo "Usage: gobblin-mapreduce.sh [OPTION] --conf <job configuration file>"
  echo "Where OPTION can be:"
  echo "  --jt <job tracker URL>    Job submission URL: if not set, taken from \${HADOOP_HOME}/conf"
  echo "  --fs <file system URL>    Target file system: if not set, taken from \${HADOOP_HOME}/conf"
  echo "  --jars <jar,jar,...>      Job jar(s): if not set, $FWDIR_LIB is examined"
  echo "  --workdir <directory>     Gobblin's base work directory: if not set, taken from \${GOBBLIN_WORK_DIR}"
  echo "  --help                    Display this help and exit"
}

# Print an error message followed by the usage on stderr, then exit 1.
# Arguments: the message words (joined with spaces).
die() {
  printf '\nError: %s\n\n' "$*" >&2
  print_usage >&2
  exit 1
}

# Parse the given arguments into the option globals listed in the header.
# Arguments that are not recognized options are ignored.
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --jt | --fs | --workdir | --jars | --conf)
        # Every one of these consumes the following argument as its value.
        [ "$#" -ge 2 ] || die "Option $1 requires an argument"
        case "$1" in
          --jt) JOB_TRACKER_URL="$2" ;;
          --fs) FS_URL="$2" ;;
          --workdir) WORK_DIR="$2" ;;
          --jars) JARS="$2" ;;
          --conf) JOB_CONFIG_FILE="$2" ;;
        esac
        shift
        ;;
      --help)
        print_usage
        exit 0
        ;;
      *) ;;
    esac
    shift
  done
}

parse_args "$@"

if [ -z "$JOB_CONFIG_FILE" ]; then
  die "No job configuration file set!"
fi

# User defined work directory overrides $GOBBLIN_WORK_DIR
if [ -n "$WORK_DIR" ]; then
  export GOBBLIN_WORK_DIR="$WORK_DIR"
fi

if [ -z "$GOBBLIN_WORK_DIR" ]; then
  die "GOBBLIN_WORK_DIR is not set!"
fi

# Sourced after the checks above, so values set in gobblin-env.sh cannot
# satisfy them.
. "$FWDIR_CONF/gobblin-env.sh"
# gobblin-mapreduce.sh: jar list / classpath assembly and job launch.
#
# Globals read:    JARS, FWDIR_LIB, FWDIR_CONF, JOB_TRACKER_URL, FS_URL,
#                  JOB_CONFIG_FILE, HADOOP_CLASSPATH, HADOOP_BIN_DIR
#                  (HADOOP_BIN_DIR is not set anywhere in this script --
#                  presumably it comes from the environment or gobblin-env.sh)
# Globals written: USER_JARS, LIBJARS, GOBBLIN_DEP_JARS, GOBBLIN_CONFIG_FILE,
#                  JT_COMMAND, FS_COMMAND; exports HADOOP_USER_CLASSPATH_FIRST
#                  and HADOOP_CLASSPATH

# Comma-separated absolute paths of the user supplied job jars.
USER_JARS=""

#######################################
# Append every jar of a comma-separated list to USER_JARS.
# Arguments: $1 - comma-separated jar paths (may be empty)
# Returns:   0 always; jars that cannot be resolved are skipped with a warning
#######################################
set_user_jars() {
  local -a userjars=()
  local userjar
  [ -n "$1" ] || return 0
  # The IFS assignment is scoped to this one read, so the caller's IFS
  # is left untouched.
  IFS=',' read -ra userjars <<< "$1"
  for userjar in "${userjars[@]}"; do
    [ -n "$userjar" ] || continue # tolerate "a.jar,,b.jar"
    add_user_jar "$userjar"
  done
  return 0
}

#######################################
# Append one jar to USER_JARS with its directory made absolute.
# Arguments: $1 - path to a jar, absolute or relative to the current directory
# Returns:   1 (and warns on stderr) if the jar's directory cannot be entered
#######################################
add_user_jar() {
  local jar_dir jar_name
  jar_name=$(basename -- "$1")
  # cd's stdout is discarded because it prints the directory when CDPATH is used.
  if ! jar_dir=$(cd -- "$(dirname -- "$1")" > /dev/null 2>&1 && pwd); then
    echo "Warning: cannot resolve directory of jar '$1', skipping it" >&2
    return 1
  fi
  USER_JARS+="${USER_JARS:+,}$jar_dir/$jar_name"
}

# Add the absolute path of the user defined job jars to the LIBJARS first
set_user_jars "$JARS"

# Jars Gobblin runtime depends on
runtime_jars=(
  gobblin-metastore.jar gobblin-metrics.jar gobblin-core.jar gobblin-api.jar
  gobblin-utility.jar guava-15.0.jar avro-1.7.6.jar avro-mapred-1.7.6.jar
  metrics-core-3.0.2.jar gson-2.2.4.jar joda-time-1.6.jar data-1.15.9.jar
)
LIBJARS=$USER_JARS
for runtime_jar in "${runtime_jars[@]}"; do
  LIBJARS+="${LIBJARS:+,}$FWDIR_LIB/$runtime_jar"
done

# Add libraries to the Hadoop classpath: user jars first, then everything in lib/.
GOBBLIN_DEP_JARS=${USER_JARS//,/:}
for jarFile in "$FWDIR_LIB"/*; do
  [ -e "$jarFile" ] || continue # unmatched glob stays literal; skip it
  GOBBLIN_DEP_JARS+="${GOBBLIN_DEP_JARS:+:}$jarFile"
done

# Honor Gobblin dependencies
export HADOOP_USER_CLASSPATH_FIRST=true
# Join without producing an empty entry (a leading or trailing ':').
if [ -n "$GOBBLIN_DEP_JARS" ] && [ -n "$HADOOP_CLASSPATH" ]; then
  HADOOP_CLASSPATH="$GOBBLIN_DEP_JARS:$HADOOP_CLASSPATH"
elif [ -n "$GOBBLIN_DEP_JARS" ]; then
  HADOOP_CLASSPATH="$GOBBLIN_DEP_JARS"
fi
export HADOOP_CLASSPATH

GOBBLIN_CONFIG_FILE="$FWDIR_CONF/gobblin-mapreduce.properties"

# -jt / -fs are passed only when the matching URL was given; arrays keep each
# flag and its value as separate, intact words.
JT_COMMAND=()
if [ -n "$JOB_TRACKER_URL" ]; then
  JT_COMMAND=(-jt "$JOB_TRACKER_URL")
fi
FS_COMMAND=()
if [ -n "$FS_URL" ]; then
  FS_COMMAND=(-fs "$FS_URL")
fi

# Launch the job to run on Hadoop
"$HADOOP_BIN_DIR/hadoop" jar \
  "$FWDIR_LIB/gobblin-runtime.jar" \
  gobblin.runtime.mapreduce.CliMRJobLauncher \
  -D mapreduce.user.classpath.first=true \
  -D mapreduce.job.user.classpath.first=true \
  "${JT_COMMAND[@]}" \
  "${FS_COMMAND[@]}" \
  -libjars "$LIBJARS" \
  -sysconfig "$GOBBLIN_CONFIG_FILE" \
  -jobconfig "$JOB_CONFIG_FILE"
#!/bin/bash
#
# gobblin-standalone.sh: command-line handling.
#
# Globals written:
#   FWDIR, FWDIR_LIB, FWDIR_CONF   install root and its lib/ and conf/ dirs
#   ACTION                         one of start, stop, restart, status
#   WORK_DIR, JARS, JOB_CONFIG_DIR values of --workdir, --jars, --conf

FWDIR="$(cd -- "$(dirname -- "$0")/.." && pwd)"
FWDIR_LIB="$FWDIR/lib"
FWDIR_CONF="$FWDIR/conf"

# Print the command-line synopsis on stdout.
print_usage() {
  echo "gobblin-standalone.sh <start|stop|restart|status> [OPTION]"
  echo "Where:"
  echo "  --workdir <directory>    Gobblin's base work directory: if not set, taken from \${GOBBLIN_WORK_DIR}"
  echo "  --jars <jar,jar,...>     Job jar(s): if not set, $FWDIR_LIB is examined"
  # The variable name is printed literally, like the --workdir line above.
  echo "  --conf <directory>       Directory of job configuration files: if not set, taken from \${GOBBLIN_JOB_CONFIG_DIR}"
  echo "  --help                   Display this help and exit"
}

# Print an error message followed by the usage on stderr, then exit 1.
# Arguments: the message words (joined with spaces).
die() {
  printf '\nError: %s\n\n' "$*" >&2
  print_usage >&2
  exit 1
}

#######################################
# Parse the given arguments into ACTION, WORK_DIR, JARS and JOB_CONFIG_DIR.
# Arguments that are neither an action nor a recognized option are ignored.
# Exits 0 after --help, 1 when a value-taking option is the last argument.
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      start | stop | restart | status)
        ACTION="$1"
        ;;
      --workdir | --jars | --conf)
        # Each of these consumes the following argument as its value.
        [ "$#" -ge 2 ] || die "Option $1 requires an argument"
        case "$1" in
          --workdir) WORK_DIR="$2" ;;
          --jars) JARS="$2" ;;
          --conf) JOB_CONFIG_DIR="$2" ;;
        esac
        shift
        ;;
      --help)
        print_usage
        exit 0
        ;;
      *) ;;
    esac
    shift
  done
}

parse_args "$@"
# gobblin-standalone.sh: environment validation and PID lookup.
#
# Globals read:    JAVA_HOME, ACTION, JOB_CONFIG_DIR, WORK_DIR, FWDIR, FWDIR_CONF
# Globals written: check, CONFIG_FILE, PID, PID_VALUE; exports
#                  GOBBLIN_JOB_CONFIG_DIR and GOBBLIN_WORK_DIR when the
#                  matching command-line option was given
# Calls die (exit 1) when a required setting is missing.

if [ -z "$JAVA_HOME" ]; then
  die "Environment variable JAVA_HOME not set!"
fi

# Only start and restart require the job configuration and work directories.
# [[ ]] keeps the comparison valid when no action was given at all.
check=false
if [[ "$ACTION" == "start" || "$ACTION" == "restart" ]]; then
  check=true
fi

# User defined job configuration directory overrides $GOBBLIN_JOB_CONFIG_DIR
if [ -n "$JOB_CONFIG_DIR" ]; then
  export GOBBLIN_JOB_CONFIG_DIR="$JOB_CONFIG_DIR"
fi

if [ -z "$GOBBLIN_JOB_CONFIG_DIR" ] && [ "$check" == true ]; then
  die "Environment variable GOBBLIN_JOB_CONFIG_DIR not set!"
fi

# User defined work directory overrides $GOBBLIN_WORK_DIR
if [ -n "$WORK_DIR" ]; then
  export GOBBLIN_WORK_DIR="$WORK_DIR"
fi

if [ -z "$GOBBLIN_WORK_DIR" ] && [ "$check" == true ]; then
  die "GOBBLIN_WORK_DIR is not set!"
fi

# Sourced after the checks above, so values set in gobblin-env.sh cannot
# satisfy them.
. "$FWDIR_CONF/gobblin-env.sh"

CONFIG_FILE="$FWDIR_CONF/gobblin-standalone.properties"

# File holding the PID value that is read below.
PID="$GOBBLIN_WORK_DIR/.gobblin-pid"

if [ -f "$PID" ]; then
  # The redirect sits inside the substitution so it actually applies to cat.
  PID_VALUE=$(cat -- "$PID" 2> /dev/null)
else
  PID_VALUE=""
fi

if [ ! -d "$FWDIR/logs" ]; then
  mkdir "$FWDIR/logs"
fi
# gobblin-standalone.sh: user jar handling.
#
# Colon-separated classpath under construction. It is reset here so that a
# GOBBLIN_JARS value inherited from the environment cannot leak into the
# classpath; user jars are added first.
GOBBLIN_JARS=""

#######################################
# Append every jar of a comma-separated list to GOBBLIN_JARS.
# Arguments: $1 - comma-separated jar paths (may be empty)
# Returns:   0 always; jars that cannot be resolved are skipped with a warning
#######################################
set_user_jars() {
  local -a userjars=()
  local userjar
  [ -n "$1" ] || return 0
  # The IFS assignment is scoped to this one read, so the caller's IFS
  # is left untouched.
  IFS=',' read -ra userjars <<< "$1"
  for userjar in "${userjars[@]}"; do
    [ -n "$userjar" ] || continue # tolerate "a.jar,,b.jar"
    add_user_jar "$userjar"
  done
  return 0
}

#######################################
# Append one jar to GOBBLIN_JARS with its directory made absolute.
# The ':' separator is derived from GOBBLIN_JARS itself, so repeated calls
# (from set_user_jars or directly) always produce a well-formed list.
# Arguments: $1 - path to a jar, absolute or relative to the current directory
# Returns:   1 (and warns on stderr) if the jar's directory cannot be entered
#######################################
add_user_jar() {
  local jar_dir jar_name
  jar_name=$(basename -- "$1")
  # cd's stdout is discarded because it prints the directory when CDPATH is used.
  if ! jar_dir=$(cd -- "$(dirname -- "$1")" > /dev/null 2>&1 && pwd); then
    echo "Warning: cannot resolve directory of jar '$1', skipping it" >&2
    return 1
  fi
  GOBBLIN_JARS+="${GOBBLIN_JARS:+:}$jar_dir/$jar_name"
}

# Add the absolute path of the user defined job jars to the GOBBLIN_JARS
set_user_jars "$JARS"
"Gobblin standalone daemon is running with status: " @@ -101,7 +175,7 @@ status() { fi } -case $ACTION in +case "$ACTION" in "start") start ;; diff --git a/build.gradle b/build.gradle index 503d01ea850..cd4cff8d848 100644 --- a/build.gradle +++ b/build.gradle @@ -41,7 +41,7 @@ if (!project.hasProperty('pegasusVersion')) { ext.externalDependency = [ "antlrRuntime": "org.antlr:antlr-runtime:3.0.1", - "avro": "org.apache.avro:avro:1.7.1", + "avro": "org.apache.avro:avro:1.7.6", "avroMapred": "org.apache.avro:avro-mapred:1.7.6", "commonsCli": "commons-cli:commons-cli:1.2", "commonsDbcp": "commons-dbcp:commons-dbcp:1.4", @@ -135,6 +135,9 @@ subprojects { } else { compile externalDependency.hadoop } + compile(externalDependency.guava) { + force = true + } } } diff --git a/conf/gobblin-env.sh b/conf/gobblin-env.sh new file mode 100755 index 00000000000..384eafb935a --- /dev/null +++ b/conf/gobblin-env.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +# Set Gobblin specific environment variables here. diff --git a/conf/gobblin-mapreduce.properties b/conf/gobblin-mapreduce.properties index 9f5d09a637d..7636dfb2624 100644 --- a/conf/gobblin-mapreduce.properties +++ b/conf/gobblin-mapreduce.properties @@ -8,7 +8,7 @@ taskretry.threadpool.coresize=1 taskretry.threadpool.maxsize=2 # File system URIs -fs.uri=file:/// +fs.uri=hdfs://localhost:8020 writer.fs.uri=${fs.uri} state.store.fs.uri=${fs.uri} diff --git a/gobblin-runtime/src/main/java/gobblin/runtime/mapreduce/CliMRJobLauncher.java b/gobblin-runtime/src/main/java/gobblin/runtime/mapreduce/CliMRJobLauncher.java index 9af01788c61..52245e26894 100644 --- a/gobblin-runtime/src/main/java/gobblin/runtime/mapreduce/CliMRJobLauncher.java +++ b/gobblin-runtime/src/main/java/gobblin/runtime/mapreduce/CliMRJobLauncher.java @@ -23,6 +23,7 @@ import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.GenericOptionsParser; import 
org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; @@ -76,6 +77,11 @@ public static void printUsage(Options options) { public static void main(String[] args) throws Exception { + + Configuration conf = new Configuration(); + // Parse generic options + String[] genericCmdLineOpts = new GenericOptionsParser(conf, args).getCommandLine().getArgs(); + // Build command-line options Option sysConfigOption = OptionBuilder .withArgName("system configuration file") @@ -100,7 +106,7 @@ public static void main(String[] args) options.addOption(helpOption); // Parse command-line options - CommandLine cmd = new BasicParser().parse(options, args); + CommandLine cmd = new BasicParser().parse(options, genericCmdLineOpts); if (cmd.hasOption('h')) { printUsage(options); @@ -119,6 +125,6 @@ public static void main(String[] args) ConfigurationConverter.getProperties(new PropertiesConfiguration(cmd.getOptionValue("jobconfig"))); // Launch and run the job - System.exit(ToolRunner.run(new Configuration(), new CliMRJobLauncher(sysConfig, jobConfig), args)); + System.exit(ToolRunner.run(conf, new CliMRJobLauncher(sysConfig, jobConfig), args)); } }