Regularly poll executors to track their utilization #613
base: master
Changes from 8 commits
@@ -0,0 +1,59 @@
package spark.scheduler

import scala.collection.mutable

import spark.scheduler.cluster.StandaloneSchedulerBackend
import spark.scheduler.local.LocalScheduler
import spark.Logging

/**
 * Polls every executor at a regular interval, so that executor status can be reported
 * to registered listeners while stages are running.
 */
abstract class ExecutorStatusPoller extends Logging {
  val waitBetweenPolls = System.getProperty(ExecutorStatusPoller.OPEN_POLLS_WAIT_KEY, "100").toLong
  val executorToLastPoll = mutable.Map[String, Long]()

  // simple round-robin poll of each executor, throttled so that each executor is polled
  // at most once per waitBetweenPolls milliseconds
  val t = new Thread("executor-poller") {
    setDaemon(true)
    override def run() {
      while (true) {
        val now = System.currentTimeMillis()
        // if the results also came back through this class, we could additionally throttle
        // in terms of the number of open polls
        var minWait = waitBetweenPolls
        executorList.foreach { executorId =>
          val lastPoll = executorToLastPoll.getOrElseUpdate(executorId, now)
          val remainingWait = waitBetweenPolls - (now - lastPoll)
          if (remainingWait <= 0) {
            pollExecutor(executorId)
            executorToLastPoll(executorId) = System.currentTimeMillis()
          } else if (remainingWait < minWait) {
            minWait = remainingWait
          }
        }
        Thread.sleep(minWait)
      }
    }
  }
  t.start()

  def executorList: Seq[String]
  def pollExecutor(executorId: String)
}

class StandaloneExecutorStatusPoller(val sched: StandaloneSchedulerBackend) extends ExecutorStatusPoller {
  override def executorList = sched.allExecutors.toSeq
  override def pollExecutor(executorId: String) {
    sched.requestExecutorStatus(executorId)
  }
}

class LocalExecutorStatusPoller(val sched: LocalScheduler) extends ExecutorStatusPoller {
  override def executorList = Seq("local") // just needs one element; the value doesn't matter
  override def pollExecutor(executorId: String) {
    sched.reportExecutorStatus
  }
}

object ExecutorStatusPoller {
  val OPEN_POLLS_WAIT_KEY = "spark.executor_poll.wait_ms"
}
@@ -6,27 +6,56 @@ import spark.{Utils, Logging}
import spark.executor.TaskMetrics

trait SparkListener {

  /**
   * Called when Spark starts computing a new stage.
   */
  def onStageStarted(stageStarted: StageStarted)

  /**
   * Called when a stage is completed, with information on the completed stage.
   */
  def onStageCompleted(stageCompleted: StageCompleted)

  /**
   * Called when there is information on the status of an executor. This may be called at any
   * time: there may not be any active stages when it is called. Furthermore, it may be called
   * often, so don't do anything expensive here.
   */
  def onExecutorStatusUpdate(executorStatus: ExecutorStatus)
}

sealed trait SparkListenerEvents

case class StageStarted(val stage: Stage) extends SparkListenerEvents

case class StageCompleted(val stageInfo: StageInfo) extends SparkListenerEvents

case class ExecutorStatus(val executorId: String, val activeTasks: Int, val availableCores: Int)
  extends SparkListenerEvents

/**
 * Simple SparkListener that logs a few summary statistics when each stage completes
 */
class StatsReportListener extends SparkListener with Logging {

  var activeStageToExecutorStatus = Map[Int, ExecutorActivitySummary]()

  def onStageStarted(stageStarted: StageStarted) {
    activeStageToExecutorStatus += stageStarted.stage.id -> ExecutorActivitySummary(0, 0)
  }

  def onStageCompleted(stageCompleted: StageCompleted) {
    import spark.scheduler.StatsReportListener._
    implicit val sc = stageCompleted
    val execStatus = activeStageToExecutorStatus(stageCompleted.stageInfo.stage.id)
    activeStageToExecutorStatus -= stageCompleted.stageInfo.stage.id
    this.logInfo("Finished stage: " + stageCompleted.stageInfo)
    showMillisDistribution("task runtime:", (info, _) => Some(info.duration))

    // overall work distribution
    this.logInfo("executor utilization: %2.0f %%".format(execStatus.activePercent))

    // shuffle write
    showBytesDistribution("shuffle bytes written:", (_, metric) => metric.shuffleWriteMetrics.map{_.shuffleBytesWritten})

@@ -44,6 +73,13 @@ class StatsReportListener extends SparkListener with Logging {
    showDistribution("other time pct: ", Distribution(runtimePcts.map{_.other * 100}), "%2.0f %%")
  }

  def onExecutorStatusUpdate(executorStatus: ExecutorStatus) {
    // update ALL active stages
    activeStageToExecutorStatus.foreach { case (k, v) =>
      activeStageToExecutorStatus += k -> (v + executorStatus)
    }
Review comment: Note that (You could achieve the same effect without actually sending all the messages when there are no active stages, but the cluster is idle anyway, so why not.)
  }

}

object StatsReportListener extends Logging {

@@ -131,6 +167,16 @@ object StatsReportListener extends Logging {
  }
}

case class ExecutorActivitySummary(activeCores: Int, totalCores: Int) {
  def +(execStatus: ExecutorStatus): ExecutorActivitySummary = {
    ExecutorActivitySummary(activeCores + execStatus.activeTasks, totalCores + execStatus.availableCores)
  }

  def activePercent: Double = (activeCores.toDouble / totalCores) * 100
}

case class RuntimePercentage(executorPct: Double, fetchPct: Option[Double], other: Double)
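For illustration, a minimal listener against the trait above; the class name and println output are illustrative, not part of this change:

class UtilizationLogger extends SparkListener {
  def onStageStarted(stageStarted: StageStarted) {}
  def onStageCompleted(stageCompleted: StageCompleted) {}
  def onExecutorStatusUpdate(status: ExecutorStatus) {
    // keep this cheap: per the scaladoc above, it may be called often
    println("%s: %d of %d cores busy".format(status.executorId, status.activeTasks, status.availableCores))
  }
}

Note how StatsReportListener turns these updates into a stage-level utilization number: two polls of a 4-core executor with 4 and then 2 active tasks accumulate to ExecutorActivitySummary(6, 8), so activePercent reports (6.0 / 8) * 100 = 75%.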
@@ -23,12 +23,14 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor

  // Use an atomic variable to track total number of cores in the cluster for simplicity and speed
  var totalCoreCount = new AtomicInteger(0)
  var allExecutors = new HashSet[String]

Review comment: Instead of having two state-tracking variables (allExecutors and totalCores), could we not just have a single Map[String, Int] where the key is the executor id and the value is the number of cores? Something like: val allExecutors = Map.empty[String, Int].withDefaultValue(0)
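A sketch of that suggestion, assuming the map is kept in sync where executors register and are removed (the executor id and core count shown are illustrative):

import scala.collection.mutable

// one map replaces both allExecutors and the per-executor core counts
val allExecutors = mutable.Map.empty[String, Int].withDefaultValue(0)
allExecutors("exec-1") = 4                 // on RegisterExecutor: record the executor's cores
val executorIds = allExecutors.keys.toSeq  // what executorList would return
val totalCores = allExecutors.values.sum   // total core count, recoverable whenever needed
allExecutors -= "exec-1"                   // on removeExecutor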
  class DriverActor(sparkProperties: Seq[(String, String)]) extends Actor {
    private val executorActor = new HashMap[String, ActorRef]
    private val executorAddress = new HashMap[String, Address]
    private val executorHostPort = new HashMap[String, String]
    private val freeCores = new HashMap[String, Int]
    private val totalCores = new HashMap[String, Int]
    private val actorToExecutorId = new HashMap[ActorRef, String]
    private val addressToExecutorId = new HashMap[Address, String]

@@ -46,9 +48,11 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
        logInfo("Registered executor: " + sender + " with ID " + executorId)
        sender ! RegisteredExecutor(sparkProperties)
        context.watch(sender)
        allExecutors.synchronized(allExecutors += executorId)
        executorActor(executorId) = sender
        executorHostPort(executorId) = hostPort
        freeCores(executorId) = cores
        totalCores(executorId) = cores
        executorAddress(executorId) = sender.path.address
        actorToExecutorId(sender) = executorId
        addressToExecutorId(sender.path.address) = executorId

@@ -82,6 +86,14 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor

      case RemoteClientShutdown(transport, address) =>
        addressToExecutorId.get(address).foreach(removeExecutor(_, "remote Akka client shutdown"))

      case res @ RequestExecutorStatus(executorId) =>
        executorActor(executorId) ! res

      case es: ExecutorStatus =>
        // convert to the public-API executor status, which includes the number of cores on the executor
        val executorStatus = spark.scheduler.ExecutorStatus(es.executorId, es.activeThreads, totalCores(es.executorId))
        scheduler.executorStatusUpdate(executorStatus)
    }

    // Make fake resource offers on all executors

@@ -114,6 +126,8 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
      executorActor -= executorId
      executorHostPort -= executorId
      freeCores -= executorId
      totalCores -= executorId
      allExecutors.synchronized(allExecutors -= executorId)
      executorHostPort -= executorId
      totalCoreCount.addAndGet(-numCores)
      scheduler.executorLost(executorId, SlaveLost(reason))

@@ -156,6 +170,10 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor
    driverActor ! ReviveOffers
  }

  def requestExecutorStatus(executorId: String) {
    driverActor ! RequestExecutorStatus(executorId)
  }

  override def defaultParallelism() = Option(System.getProperty("spark.default.parallelism"))
    .map(_.toInt).getOrElse(math.max(totalCoreCount.get(), 2))
Review comment: I'd suggest using a java.util.concurrent.ScheduledThreadPoolExecutor instead of an infinite while loop. This would let you schedule the poller at a fixed interval without having to manage the sleep "catchup" time yourself, and it would also let you gracefully stop the poller.
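A minimal sketch of that approach, reusing the poller's existing hooks; the class name and stop() method are illustrative, not the reviewer's actual snippet:

import java.util.concurrent.{ScheduledThreadPoolExecutor, ThreadFactory, TimeUnit}

abstract class ScheduledExecutorStatusPoller {
  def executorList: Seq[String]
  def pollExecutor(executorId: String)

  private val waitBetweenPolls =
    System.getProperty(ExecutorStatusPoller.OPEN_POLLS_WAIT_KEY, "100").toLong

  // one scheduled daemon thread replaces the hand-rolled loop and its sleep bookkeeping
  private val pool = new ScheduledThreadPoolExecutor(1, new ThreadFactory {
    def newThread(r: Runnable) = {
      val t = new Thread(r, "executor-poller")
      t.setDaemon(true)
      t
    }
  })
  pool.scheduleAtFixedRate(new Runnable {
    def run() { executorList.foreach(pollExecutor) }
  }, 0, waitBetweenPolls, TimeUnit.MILLISECONDS)

  // graceful shutdown, which the while(true) version cannot offer
  def stop() { pool.shutdown() }
}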
Reply: Good point, I will make that change. This also got me thinking -- do I even want to create a new thread at all? Is there an appropriate thread pool for these repeated tasks already?