-
Notifications
You must be signed in to change notification settings - Fork 0
/
Framework
42 lines (40 loc) · 99.1 KB
/
Framework
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
<!DOCTYPE html>
<html>
<head>
<meta name="databricks-html-version" content="1">
<title>SparkImplementation - Databricks</title>
<meta charset="utf-8">
<meta name="google" content="notranslate">
<meta name="robots" content="nofollow">
<meta http-equiv="Content-Language" content="en">
<meta http-equiv="Content-Type" content="text/html; charset=UTF8">
<link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/css/main.css">
<link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/css/print.css" media="print">
<link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/img/favicon.ico"/>
<script>window.settings = {"enableUsageDeliveryConfiguration":false,"enableNotebookNotifications":true,"enableSshKeyUI":false,"defaultInteractivePricePerDBU":0.4,"enableClusterMetricsUI":true,"enableOnDemandClusterType":true,"enableAutoCompleteAsYouType":[],"devTierName":"Community Edition","enableJobsPrefetching":true,"workspaceFeaturedLinks":[{"linkURI":"https://docs.databricks.com/index.html","displayName":"Documentation","icon":"question"},{"linkURI":"https://docs.databricks.com/release-notes/product/index.html","displayName":"Release Notes","icon":"code"},{"linkURI":"https://docs.databricks.com/spark/latest/training/index.html","displayName":"Training & Tutorials","icon":"graduation-cap"}],"enableReservoirTableUI":false,"enableClearStateFeature":true,"dbcForumURL":"http://forums.databricks.com/","enableProtoClusterInfoDeltaPublisher":true,"enableAttachExistingCluster":true,"resetJobListOnConnect":true,"serverlessDefaultSparkVersion":"latest-stable-scala2.11","maxCustomTags":45,"serverlessDefaultMaxWorkers":20,"enableInstanceProfilesUIInJobs":true,"nodeInfo":{"node_types":[{"support_ssh":false,"spark_heap_memory":4800,"instance_type_id":"r3.2xlarge","spark_core_oversubscription_factor":8.0,"node_type_id":"dev-tier-node","description":"Community Optimized","support_cluster_tags":false,"container_memory_mb":6000,"node_instance_type":{"instance_type_id":"r3.2xlarge","provider":"AWS","local_disk_size_gb":160,"compute_units":26.0,"number_of_ips":14,"local_disks":1,"reserved_compute_units":3.64,"gpus":0,"memory_mb":62464,"num_cores":8,"local_disk_type":"AHCI","max_attachable_disks":0,"supported_disk_types":[{"ebs_volume_type":"GENERAL_PURPOSE_SSD"},{"ebs_volume_type":"THROUGHPUT_OPTIMIZED_HDD"}],"reserved_memory_mb":4800},"memory_mb":6144,"is_hidden":false,"category":"Community Edition","num_cores":0.88,"support_port_forwarding":false,"support_ebs_volumes":false,"is_deprecated":false}],"default_node_type_id":"dev-tier-node"},"sqlAclsDisabledMap":{"spark.databricks.acl.enabled":"false","spark.databricks.acl.sqlOnly":"false"},"enableDatabaseSupportClusterChoice":true,"enableClusterAcls":true,"notebookRevisionVisibilityHorizon":999999,"serverlessClusterProductName":"Serverless Pool","showS3TableImportOption":true,"maxEbsVolumesPerInstance":10,"enableRStudioUI":false,"isAdmin":true,"deltaProcessingBatchSize":1000,"timerUpdateQueueLength":100,"sqlAclsEnabledMap":{"spark.databricks.acl.enabled":"true","spark.databricks.acl.sqlOnly":"true"},"enableLargeResultDownload":true,"maxElasticDiskCapacityGB":5000,"serverlessDefaultMinWorkers":2,"zoneInfos":[{"id":"us-west-2c","isDefault":true},{"id":"us-west-2b","isDefault":false},{"id":"us-west-2a","isDefault":false}],"enableCustomSpotPricingUIByTier":false,"serverlessClustersEnabled":false,"enableWorkspaceBrowserSorting":true,"enableSentryLogging":false,"enableFindAndReplace":true,"disallowUrlImportExceptFromDocs":false,"defaultStandardClusterModel":{"cluster_name":"","node_type_id":"dev-tier-node","spark_version":"3.5.x-scala2.11","num_workers":0,"aws_attributes":{"first_on_demand":0,"availability":"ON_DEMAND","zone_id":"us-west-2c","spot_bid_price_percent":100},"autotermination_minutes":120,"default_tags":{"Vendor":"Databricks","Creator":"benny.avelin@math.uu.se","ClusterName":null,"ClusterId":"<Generated after creation>"}},"enableEBSVolumesUIForJobs":true,"enablePublishNotebooks":true,"enableBitbucketCloud":true,"createTableInNotebookS3Link":{"url":"https://docs.databricks.com/_static/notebooks/data-import/s3.html","displayName":"S3","workspaceFileName":"S3 Example"},"sanitizeHtmlResult":true,"enableJobAclsConfig":false,"enableFullTextSearch":false,"enableElasticSparkUI":false,"enableNewClustersCreate":true,"clusters":true,"allowRunOnPendingClusters":true,"useAutoscalingByDefault":false,"enableAzureToolbar":false,"fileStoreBase":"FileStore","enableEmailInAzure":false,"enableRLibraries":true,"enableTableAclsConfig":false,"enableSshKeyUIInJobs":true,"enableDetachAndAttachSubMenu":true,"configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableAdminPasswordReset":false,"checkBeforeAddingAadUser":false,"enableResetPassword":true,"maxClusterTagValueLength":255,"enableJobsSparkUpgrade":true,"createTableInNotebookDBFSLink":{"url":"https://docs.databricks.com/_static/notebooks/data-import/dbfs.html","displayName":"DBFS","workspaceFileName":"DBFS Example"},"perClusterAutoterminationEnabled":false,"enableNotebookCommandNumbers":true,"allowStyleInSanitizedHtml":true,"sparkVersions":[{"key":"1.6.3-db2-hadoop2-scala2.10","displayName":"Spark 1.6.3-db2 (Hadoop 2, Scala 2.10)","packageLabel":"spark-image-aba860a0ffce4f3471fb14aefdcb1d768ac66a53a5ad884c48745ef98aeb9d67","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"3.3.x-gpu-scala2.11","displayName":"3.3 (includes Apache Spark 2.2.0, GPU, Scala 2.11)","packageLabel":"spark-image-86b4917bb6586289ca64e65f64fd23678c297274be6cd6aa6aa01d7b91fed29c","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.1.1-db5-scala2.11","displayName":"Spark 2.1.1-db5 (Scala 2.11)","packageLabel":"spark-image-08d9fc1551087e0876236f19640c4a83116b1649f15137427d21c9056656e80e","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.3.x-scala2.10","displayName":"3.3 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-d7df74e188103a4093ff4467dbf0d32886366c984097f6997e0cd87d0f6b2fa5","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1 (Hadoop 1, deprecated)","packageLabel":"spark-image-f710650fb8aaade8e4e812368ea87c45cd8cd0b5e6894ca6c94f3354e8daa6dc","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.2.x-scala2.11","displayName":"3.0 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-67ab3a06d1e83d5b60df7063245eb419a2e9fe329aeeb7e7d9713332c669bb17","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.1.1-db6-scala2.10","displayName":"Spark 2.1.1-db6 (Scala 2.10)","packageLabel":"spark-image-177f3f02a6a3432d30068332dc857b9161345bdd2ee8a2d2de05bb05cb4b0f4c","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.1.0-db2-scala2.11","displayName":"Spark 2.1.0-db2 (Scala 2.11)","packageLabel":"spark-image-267c4490a3ab8a39acdbbd9f1d36f6decdecebf013e30dd677faff50f1d9cf8b","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"4.0.x-scala2.11","displayName":"4.0 (includes Apache Spark 2.3.0, Scala 2.11)","packageLabel":"spark-image-fc9368293e1b3b6c37181d7af3123a4b9de5f7fa03cfd6dfaa038753256380c9","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"3.5.x-rc-scala2.11","displayName":"3.5.3 RC (Scala 2.11)","packageLabel":"spark-image-1ae0d5f0f2c1b33f0a0aa8fcbc515930e41ef30934964076419a0c775b8e4a55","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"2.1.x-gpu-scala2.11","displayName":"Spark 2.1 (Auto-updating, GPU, Scala 2.11 experimental)","packageLabel":"spark-image-d613235f93e0f29838beb2079a958c02a192ed67a502192bc67a8a5f2fb37f35","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.0.0-ubuntu15.10-scala2.10","displayName":"Spark 2.0.0 (Scala 2.10)","packageLabel":"spark-image-073c1b52ace74f251fae2680624a0d8d184a8b57096d1c21c5ce56c29be6a37a","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"4.0.x-rc-scala2.11","displayName":"4.0.1 RC (Scala 2.11)","packageLabel":"spark-image-3e2ac409b0e9dd6329322bfc8fcd0b56518b891b12bc6dcdd618936bfda04e8a","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"latest-stable-gpu-scala2.11","displayName":"Latest stable (GPU, Scala 2.11)","packageLabel":"spark-image-7a1e78fbfc4d1645e7478daa28377389b900aec38764df46bd836c1a9925499b","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.4.x-scala2.11","displayName":"3.4 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-f91cb0b3822c6641a9d346ef6c149118fb859b5e511ee01c31e958892ba23c7a","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.0.2-db3-scala2.10","displayName":"Spark 2.0.2-db3 (Scala 2.10)","packageLabel":"spark-image-584091dedb690de20e8cf22d9e02fdcce1281edda99eedb441a418d50e28088f","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.2.x-scala2.10","displayName":"3.2 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-557788bea0eea16bbf7a8ba13ace07e64dd7fc86270bd5cea086097fe886431f","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"latest-experimental-scala2.10","displayName":"Latest experimental (Scala 2.10)","packageLabel":"spark-image-ec81b6840af02ee2321dd8dfe2587437bbcddf024d4ae287f326a98fac406a6c","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"3.4.x-rc-gpu-scala2.11","displayName":"3.4.3 RC (GPU, Scala 2.11)","packageLabel":"spark-image-ee017217c379ee15a84e697efa3ae05993d19fb690137fce1dfb1a6b89347dd7","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"4.0.x-gpu-scala2.11","displayName":"4.0 (includes Apache Spark 2.3.0, GPU, Scala 2.11)","packageLabel":"spark-image-b543c0700f83413b0055359ea9feaf285f2e2f3350fb7f301ea0e18b018b5cb5","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.1.0-db1-scala2.11","displayName":"Spark 2.1.0-db1 (Scala 2.11)","packageLabel":"spark-image-e8ad5b72cf0f899dcf2b4720c1f572ab0e87a311d6113b943b4e1d4a7edb77eb","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.1.1-db4-scala2.11","displayName":"Spark 2.1.1-db4 (Scala 2.11)","packageLabel":"spark-image-52bca0ca866e3f4243d3820a783abf3b9b3b553edf234abef14b892657ceaca9","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"latest-rc-scala2.11","displayName":"Latest RC (4.1 snapshot, Scala 2.11)","packageLabel":"spark-image-64471328a3c4495efbd4da8ef5b1e3e1249b4ea86334ed6a9a90bd8b84311b8f","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"latest-stable-scala2.11","displayName":"Latest stable (Scala 2.11)","packageLabel":"spark-image-c919ecd682175957255cbc87041d82633406312f7b74e018e165c4fe94943b5f","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"2.1.0-db2-scala2.10","displayName":"Spark 2.1.0-db2 (Scala 2.10)","packageLabel":"spark-image-a2ca4f6b58c95f78dca91b1340305ab3fe32673bd894da2fa8e1dc8a9f8d0478","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.3.x-rc-scala2.11","displayName":"3.3.3 RC (Scala 2.11)","packageLabel":"spark-image-e726385a019c875dfa6b766d599eb7a3b55b666aaf490b0857193529220ae29d","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.4.x-rc-scala2.11","displayName":"3.4.3 RC (Scala 2.11)","packageLabel":"spark-image-18aa11edf17163d7d4c003a2ce90c18aea7aff86a30836b6b0991938b90cc7de","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"1.6.x-ubuntu15.10-hadoop1","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.0.2-db4-scala2.11","displayName":"Spark 2.0.2-db4 (Scala 2.11)","packageLabel":"spark-image-7dbc7583e8271765b8a1508cb9e832768e35489bbde2c4c790bc6766aee2fd7f","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"1.6.1-ubuntu15.10-hadoop1","displayName":"Spark 1.6.1 (Hadoop 1)","packageLabel":"spark-image-21d1cac181b7b8856dd1b4214a3a734f95b5289089349db9d9c926cb87d843db","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.0.x-gpu-scala2.11","displayName":"Spark 2.0 (Auto-updating, GPU, Scala 2.11 experimental)","packageLabel":"spark-image-968b89f1d0ec32e1ee4dacd04838cae25ef44370a441224177a37980d539d83a","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"3.3.x-rc-gpu-scala2.11","displayName":"3.3.3 RC (GPU, Scala 2.11)","packageLabel":"spark-image-ff4e225c98a5d7ff9090511db615f4749d49f06b700534178c5ac0686874566c","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"1.6.2-ubuntu15.10-hadoop1","displayName":"Spark 1.6.2 (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"next-major-version-scala2.11","displayName":"Next major version (4.0 snapshot, Scala 2.11)","packageLabel":"spark-image-04bb47b0bae8165f760972376ce05083bc6102645f3f3851cd1cdf9cba13d6fe","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"1.6.3-db1-hadoop2-scala2.10","displayName":"Spark 1.6.3-db1 (Hadoop 2, Scala 2.10)","packageLabel":"spark-image-eaa8d9b990015a14e032fb2e2e15be0b8d5af9627cd01d855df728b67969d5d9","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"1.6.3-db2-hadoop1-scala2.10","displayName":"Spark 1.6.3-db2 (Hadoop 1, Scala 2.10)","packageLabel":"spark-image-14112ea0645bea94333a571a150819ce85573cf5541167d905b7e6588645cf3b","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"4.0.x-rc-gpu-scala2.11","displayName":"4.0.1 RC (GPU, Scala 2.11)","packageLabel":"spark-image-b9c9903edd0bd3ecae649e0855fd2a989d5e79a69a30ad28a48a8b1eb2cccd61","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.5.x-scala2.10","displayName":"3.5 LTS (includes Apache Spark 2.2.1, Scala 2.10)","packageLabel":"spark-image-5e4f1f2feb631875a6036dffb069ec14b436939b5efe0ecb3ff8220c835298d6","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"1.6.2-ubuntu15.10-hadoop2","displayName":"Spark 1.6.2 (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"1.6.1-ubuntu15.10-hadoop2","displayName":"Spark 1.6.1 (Hadoop 2)","packageLabel":"spark-image-4cafdf8bc6cba8edad12f441e3b3f0a8ea27da35c896bc8290e16b41fd15496a","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.0.2-db2-scala2.10","displayName":"Spark 2.0.2-db2 (Scala 2.10)","packageLabel":"spark-image-36d48f22cca7a907538e07df71847dd22aaf84a852c2eeea2dcefe24c681602f","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.0.x-ubuntu15.10-scala2.11","displayName":"Spark 2.0 (Ubuntu 15.10, Scala 2.11, deprecated)","packageLabel":"spark-image-8e1c50d626a52eac5a6c8129e09ae206ba9890f4523775f77af4ad6d99a64c44","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"3.3.x-rc-scala2.10","displayName":"3.3.3 RC (Scala 2.10)","packageLabel":"spark-image-c13aa5a4bc0df3a5617f9e0f1f8876bc54ce81adc5332c371426068f41c39671","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.0.x-scala2.10","displayName":"Spark 2.0 (Auto-updating, Scala 2.10)","packageLabel":"spark-image-859e88079f97f58d50e25163b39a1943d1eeac0b6939c5a65faba986477e311a","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.1.1-db4-scala2.10","displayName":"Spark 2.1.1-db4 (Scala 2.10)","packageLabel":"spark-image-c7c0224de396cd1563addc1ae4bca6ba823780b6babe6c3729ddf73008f29ba4","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"latest-rc-scala2.10","displayName":"Latest RC (Scala 2.10)","packageLabel":"spark-image-ec81b6840af02ee2321dd8dfe2587437bbcddf024d4ae287f326a98fac406a6c","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"latest-stable-scala2.10","displayName":"Latest stable (Scala 2.10)","packageLabel":"spark-image-5e4f1f2feb631875a6036dffb069ec14b436939b5efe0ecb3ff8220c835298d6","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"2.0.2-db1-scala2.11","displayName":"Spark 2.0.2-db1 (Scala 2.11)","packageLabel":"spark-image-c2d623f03dd44097493c01aa54a941fc31978ebe6d759b36c75b716b2ff6ab9c","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"3.4.x-rc-scala2.10","displayName":"3.4.3 RC (Scala 2.10)","packageLabel":"spark-image-19463f3de5439de105ff37bdad871dc1bbb9ea4d57a9b671033aa0929c0d6a05","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.0.2-db4-scala2.10","displayName":"Spark 2.0.2-db4 (Scala 2.10)","packageLabel":"spark-image-859e88079f97f58d50e25163b39a1943d1eeac0b6939c5a65faba986477e311a","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.1.1-db5-scala2.10","displayName":"Spark 2.1.1-db5 (Scala 2.10)","packageLabel":"spark-image-74133df2c13950431298d1cab3e865c191d83ac33648a8590495c52fc644c654","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"3.4.x-gpu-scala2.11","displayName":"3.4 (includes Apache Spark 2.2.0, GPU, Scala 2.11)","packageLabel":"spark-image-66d1366768039140a9f5409f3bab414cb7477ebd8d4bbf8b32cb885120f9f705","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2 (Hadoop 1, deprecated)","packageLabel":"spark-image-c9d2a8abf41f157a4acc6d52bc721090346f6fea2de356f3a66e388f54481698","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"latest-experimental-gpu-scala2.11","displayName":"Latest experimental (4.1 snapshot, GPU, Scala 2.11)","packageLabel":"spark-image-47ffe674c46e62f410a1160e126d9852f6aa6b0bcdf9b3a1fc479110509ffd39","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.2.x-scala2.10","displayName":"3.0 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-d549f2d4a523994ecdf37e531b51d5ec7d8be51534bb0ca5322eaad28ba8f557","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"3.0.x-scala2.11","displayName":"3.0 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-67ab3a06d1e83d5b60df7063245eb419a2e9fe329aeeb7e7d9713332c669bb17","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.0.x-scala2.11","displayName":"Spark 2.0 (Auto-updating, Scala 2.11)","packageLabel":"spark-image-7dbc7583e8271765b8a1508cb9e832768e35489bbde2c4c790bc6766aee2fd7f","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.1.x-scala2.10","displayName":"Spark 2.1 (Auto-updating, Scala 2.10)","packageLabel":"spark-image-177f3f02a6a3432d30068332dc857b9161345bdd2ee8a2d2de05bb05cb4b0f4c","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"3.1.x-scala2.11","displayName":"3.1 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-241fa8b78ee6343242b1756b18076270894385ff40a81172a6fb5eadf66155d3","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.1.0-db3-scala2.10","displayName":"Spark 2.1.0-db3 (Scala 2.10)","packageLabel":"spark-image-25a17d070af155f10c4232dcc6248e36a2eb48c24f8d4fc00f34041b86bd1626","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.0.2-db2-scala2.11","displayName":"Spark 2.0.2-db2 (Scala 2.11)","packageLabel":"spark-image-4fa852ba378e97815083b96c9cada7b962a513ec23554a5fc849f7f1dd8c065a","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"3.5.x-rc-gpu-scala2.11","displayName":"3.5.3 RC (GPU, Scala 2.11)","packageLabel":"spark-image-d9b80be2590957541c72bead4f6bbea066358d590a324d078a99e69209019b18","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.1.x-scala2.10","displayName":"3.1 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-7efac6b9a8f2da59cb4f6d0caac46cfcb3f1ebf64c8073498c42d0360f846714","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.3.x-scala2.11","displayName":"3.3 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-0badc3d8dfc8cddd55795d02c0b31c76330cfe687d588414f91278197fbc9416","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"next-major-version-gpu-scala2.11","displayName":"Next major version (4.0 snapshot, GPU, Scala 2.11)","packageLabel":"spark-image-41e21a0db3b77bc857f10358917ccbf5fbd85290e8429c2176a5fc7a29ce4f18","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.5.x-gpu-scala2.11","displayName":"3.5 LTS (includes Apache Spark 2.2.1, GPU, Scala 2.11)","packageLabel":"spark-image-7a1e78fbfc4d1645e7478daa28377389b900aec38764df46bd836c1a9925499b","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0 (Hadoop 1, deprecated)","packageLabel":"spark-image-40d2842670bc3dc178b14042501847d76171437ccf70613fa397a7a24c48b912","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.0.1-db1-scala2.11","displayName":"Spark 2.0.1-db1 (Scala 2.11)","packageLabel":"spark-image-10ab19f634bbfdb860446c326a9f76dc25bfa87de6403b980566279142a289ea","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.0.2-db3-scala2.11","displayName":"Spark 2.0.2-db3 (Scala 2.11)","packageLabel":"spark-image-7fd7aaa89d55692e429115ae7eac3b1a1dc4de705d50510995f34306b39c2397","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.1.1-db6-scala2.11","displayName":"Spark 2.1.1-db6 (Scala 2.11)","packageLabel":"spark-image-fdad9ef557700d7a8b6bde86feccbcc3c71d1acdc838b0fd299bd19956b1076e","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"1.6.3-db1-hadoop1-scala2.10","displayName":"Spark 1.6.3-db1 (Hadoop 1, Scala 2.10)","packageLabel":"spark-image-d50af1032799546b8ccbeeb76889a20c819ebc2a0e68ea20920cb30d3895d3ae","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.0.2-db1-scala2.10","displayName":"Spark 2.0.2-db1 (Scala 2.10)","packageLabel":"spark-image-654bdd6e9bad70079491987d853b4b7abf3b736fff099701501acaabe0e75c41","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.0.x-ubuntu15.10","displayName":"Spark 2.0 (Ubuntu 15.10, Scala 2.10, deprecated)","packageLabel":"spark-image-a659f3909d51b38d297b20532fc807ecf708cfb7440ce9b090c406ab0c1e4b7e","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"3.5.x-scala2.11","displayName":"3.5 LTS (includes Apache Spark 2.2.1, Scala 2.11)","packageLabel":"spark-image-c919ecd682175957255cbc87041d82633406312f7b74e018e165c4fe94943b5f","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"latest-experimental-scala2.11","displayName":"Latest experimental (4.1 snapshot, Scala 2.11)","packageLabel":"spark-image-64471328a3c4495efbd4da8ef5b1e3e1249b4ea86334ed6a9a90bd8b84311b8f","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"3.2.x-scala2.11","displayName":"3.2 (includes Apache Spark 2.2.0, Scala 2.11)","packageLabel":"spark-image-5537926238bc55cb6cd76ee0f0789511349abead3781c4780721a845f34b5d4e","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":[]},{"key":"2.0.1-db1-scala2.10","displayName":"Spark 2.0.1-db1 (Scala 2.10)","packageLabel":"spark-image-5a13c2db3091986a4e7363006cc185c5b1108c7761ef5d0218506cf2e6643840","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.1.x-scala2.11","displayName":"Spark 2.1 (Auto-updating, Scala 2.11)","packageLabel":"spark-image-fdad9ef557700d7a8b6bde86feccbcc3c71d1acdc838b0fd299bd19956b1076e","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"2.1.0-db1-scala2.10","displayName":"Spark 2.1.0-db1 (Scala 2.10)","packageLabel":"spark-image-f0ab82a5deb7908e0d159e9af066ba05fb56e1edb35bdad41b7ad2fd62a9b546","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"3.0.x-scala2.10","displayName":"3.0 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-d549f2d4a523994ecdf37e531b51d5ec7d8be51534bb0ca5322eaad28ba8f557","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"1.6.0-ubuntu15.10","displayName":"Spark 1.6.0 (Hadoop 1)","packageLabel":"spark-image-10ef758029b8c7e19cd7f4fb52fff9180d75db92ca071bd94c47f3c1171a7cb5","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"1.6.x-ubuntu15.10-hadoop2","displayName":"Spark 1.6.x (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"2.0.0-ubuntu15.10-scala2.11","displayName":"Spark 2.0.0 (Scala 2.11)","packageLabel":"spark-image-b4ec141e751f201399f8358a82efee202560f7ed05e1a04a2ae8778f6324b909","upgradable":true,"deprecated":true,"customerVisible":false,"capabilities":[]},{"key":"2.1.0-db3-scala2.11","displayName":"Spark 2.1.0-db3 (Scala 2.11)","packageLabel":"spark-image-ccbc6b73f158e2001fc1fb8c827bfdde425d8bd6d65cb7b3269784c28bb72c16","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]},{"key":"latest-rc-gpu-scala2.11","displayName":"Latest RC (4.1 snapshot, GPU, Scala 2.11)","packageLabel":"spark-image-47ffe674c46e62f410a1160e126d9852f6aa6b0bcdf9b3a1fc479110509ffd39","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":[]},{"key":"3.5.x-rc-scala2.10","displayName":"3.5.3 RC (Scala 2.10)","packageLabel":"spark-image-0230907fb5b3e0033eeaa4d5bf64945ea6a5f3c9dcd6b81338c25a4eb28c6f8d","upgradable":true,"deprecated":false,"customerVisible":false,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},{"key":"3.4.x-scala2.10","displayName":"3.4 (includes Apache Spark 2.2.0, Scala 2.10)","packageLabel":"spark-image-867d7300605c0c54b2b1394d1bba7b88b28ed5841b3575253cded34db6ce6454","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION"]}],"enablePresentationMode":false,"enableClearStateAndRunAll":true,"enableTableAclsByTier":false,"enableRestrictedClusterCreation":true,"enableFeedback":true,"enableClusterAutoScaling":false,"enableUserVisibleDefaultTags":true,"defaultNumWorkers":0,"serverContinuationTimeoutMillis":10000,"jobsUnreachableThresholdMillis":60000,"driverStderrFilePrefix":"stderr","enableNotebookRefresh":false,"createTableInNotebookImportedFileLink":{"url":"https://docs.databricks.com/_static/notebooks/data-import/imported-file.html","displayName":"Imported File","workspaceFileName":"Imported File Example"},"accountsOwnerUrl":"https://accounts.cloud.databricks.com/registration.html#login","tableAclsDisabledMap":{"spark.databricks.acl.dfAclsEnabled":"false"},"driverStdoutFilePrefix":"stdout","showDbuPricing":true,"databricksDocsBaseHostname":"docs.databricks.com","defaultNodeTypeToPricingUnitsMap":{"r3.2xlarge":2,"i3.4xlarge":4,"class-node":1,"m4.2xlarge":1.5,"r4.xlarge":1,"m4.4xlarge":3,"Standard_DS5_v2":3,"Standard_D2s_v3":0.5,"Standard_DS4_v2_Promo":1.5,"Standard_DS14":4,"Standard_DS11_v2_Promo":0.5,"r4.16xlarge":16,"Standard_DS11":0.5,"Standard_D2_v3":0.5,"Standard_DS14_v2_Promo":4,"Standard_D64s_v3":12,"p2.8xlarge":16,"m4.10xlarge":8,"Standard_D8s_v3":1.5,"Standard_E32s_v3":8,"Standard_DS3":0.75,"Standard_DS2_v2":0.5,"r3.8xlarge":8,"r4.4xlarge":4,"dev-tier-node":1,"Standard_L8s":2,"Standard_DS13_v2_Promo":2,"Standard_E4s_v3":1,"Standard_D3_v2":0.75,"Standard_DS15_v2":5,"Standard_D16s_v3":3,"Standard_D5_v2":3,"Standard_E8s_v3":2,"Standard_DS2_v2_Promo":0.5,"c3.8xlarge":4,"Standard_D4_v3":0.75,"Standard_E2s_v3":0.5,"Standard_D32_v3":6,"Standard_DS3_v2":0.75,"r3.4xlarge":4,"Standard_DS4":1.5,"i2.4xlarge":6,"Standard_DS3_v2_Promo":0.75,"m4.xlarge":0.75,"r4.8xlarge":8,"Standard_H16":4,"Standard_DS14_v2":4,"r4.large":0.5,"Standard_DS12":1,"development-node":1,"i2.2xlarge":3,"g2.8xlarge":6,"i3.large":0.75,"memory-optimized":1,"m4.large":0.4,"Standard_D16_v3":3,"Standard_F4s":0.5,"p2.16xlarge":24,"i3.8xlarge":8,"Standard_D32s_v3":6,"i3.16xlarge":16,"Standard_DS12_v2":1,"Standard_L32s":8,"Standard_D4s_v3":0.75,"Standard_DS13":2,"Standard_DS11_v2":0.5,"Standard_DS12_v2_Promo":1,"Standard_DS13_v2":2,"c3.2xlarge":1,"Standard_L4s":1,"Standard_F16s":2,"c4.2xlarge":1,"Standard_L16s":4,"i2.xlarge":1.5,"Standard_DS2":0.5,"compute-optimized":1,"c4.4xlarge":2,"Standard_DS5_v2_Promo":3,"Standard_D64_v3":12,"Standard_D2_v2":0.5,"Standard_D8_v3":1.5,"i3.2xlarge":2,"Standard_E16s_v3":4,"Standard_F8s":1,"c3.4xlarge":2,"g2.2xlarge":1.5,"p2.xlarge":2,"m4.16xlarge":12,"Standard_DS4_v2":1.5,"c4.8xlarge":4,"i3.xlarge":1,"r3.xlarge":1,"r4.2xlarge":2,"i2.8xlarge":12},"tableFilesBaseFolder":"/tables","enableSparkDocsSearch":true,"sparkHistoryServerEnabled":true,"enableClusterAppsUIOnServerless":false,"enableEBSVolumesUI":false,"homePageWelcomeMessage":"Welcome to ","metastoreServiceRowLimit":1000000,"enableIPythonImportExport":true,"enableClusterTagsUIForJobs":true,"enableClusterTagsUI":false,"enableNotebookHistoryDiffing":true,"branch":"2.66.997","accountsLimit":3,"enableSparkEnvironmentVariables":true,"enableX509Authentication":false,"useAADLogin":false,"enableStructuredStreamingNbOptimizations":true,"enableNotebookGitBranching":true,"local":false,"enableNotebookLazyRenderWrapper":false,"enableClusterAutoScalingForJobs":true,"enableStrongPassword":false,"showReleaseNote":true,"displayDefaultContainerMemoryGB":6,"broadenedEditPermission":false,"disableS3TableImport":false,"enableArrayParamsEdit":true,"deploymentMode":"production","useSpotForWorkers":true,"removePasswordInAccountSettings":false,"preferStartTerminatedCluster":false,"enableUserInviteWorkflow":true,"createTableConnectorOptionLinks":[{"url":"https://docs.databricks.com/_static/notebooks/redshift.html","displayName":"Amazon Redshift","workspaceFileName":"Amazon Redshift Example"},{"url":"https://docs.databricks.com/_static/notebooks/structured-streaming-kinesis.html","displayName":"Amazon Kinesis","workspaceFileName":"Amazon Kinesis Example"},{"url":"https://docs.databricks.com/_static/notebooks/data-import/jdbc.html","displayName":"JDBC","workspaceFileName":"JDBC Example"},{"url":"https://docs.databricks.com/_static/notebooks/cassandra.html","displayName":"Cassandra","workspaceFileName":"Cassandra Example"},{"url":"https://docs.databricks.com/_static/notebooks/structured-streaming-etl-kafka.html","displayName":"Kafka","workspaceFileName":"Kafka Example"},{"url":"https://docs.databricks.com/_static/notebooks/redis.html","displayName":"Redis","workspaceFileName":"Redis Example"},{"url":"https://docs.databricks.com/_static/notebooks/elasticsearch.html","displayName":"Elasticsearch","workspaceFileName":"Elasticsearch Example"}],"enableStaticNotebooks":true,"enableNewLineChart":true,"sandboxForUrlSandboxFrame":"allow-scripts allow-popups allow-popups-to-escape-sandbox allow-forms","enableCssTransitions":true,"serverlessEnableElasticDisk":true,"minClusterTagKeyLength":1,"showHomepageFeaturedLinks":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterEdit":true,"enableClusterAclsConfig":false,"useTempS3UrlForTableUpload":false,"notifyLastLogin":false,"enableSshKeyUIByTier":false,"enableCreateClusterOnAttach":true,"defaultAutomatedPricePerDBU":0.2,"enableNotebookGitVersioning":true,"defaultMinWorkers":2,"files":"files/","feedbackEmail":"feedback@databricks.com","enableDriverLogsUI":true,"enableExperimentalCharts":false,"defaultMaxWorkers":8,"enableWorkspaceAclsConfig":false,"serverlessRunPythonAsLowPrivilegeUser":false,"dropzoneMaxFileSize":2047,"enableNewClustersList":true,"enableNewDashboardViews":true,"enableJobListPermissionFilter":true,"driverLog4jFilePrefix":"log4j","enableSingleSignOn":true,"enableMavenLibraries":true,"displayRowLimit":1000,"deltaProcessingAsyncEnabled":true,"enableSparkEnvironmentVariablesUI":false,"defaultSparkVersion":{"key":"3.5.x-scala2.11","displayName":"3.5 LTS (includes Apache Spark 2.2.1, Scala 2.11)","packageLabel":"spark-image-c919ecd682175957255cbc87041d82633406312f7b74e018e165c4fe94943b5f","upgradable":true,"deprecated":false,"customerVisible":true,"capabilities":["SUPPORTS_END_TO_END_ENCRYPTION","SUPPORTS_TABLE_ACLS"]},"enableNewLineChartParams":false,"deprecatedEnableStructuredDataAcls":false,"enableCustomSpotPricing":false,"enableMountAclsConfig":false,"defaultAutoterminationMin":120,"useDevTierHomePage":true,"disableExportNotebook":false,"enableClusterClone":true,"enableNotebookLineNumbers":true,"enablePublishHub":false,"notebookHubUrl":"http://hub.dev.databricks.com/","showSqlEndpoints":false,"enableNotebookDatasetInfoView":true,"defaultTagKeys":{"CLUSTER_NAME":"ClusterName","VENDOR":"Vendor","CLUSTER_TYPE":"ResourceClass","CREATOR":"Creator","CLUSTER_ID":"ClusterId"},"enableClusterAclsByTier":false,"databricksDocsBaseUrl":"https://docs.databricks.com/","azurePortalLink":"https://portal.azure.com","cloud":"AWS","customSparkVersionPrefix":"custom:","disallowAddingAdmins":true,"enableSparkConfUI":true,"enableClusterEventsUI":false,"featureTier":"DEVELOPER_BASIC_TIER","mavenCentralSearchEndpoint":"http://search.maven.org/solrsearch/select","defaultServerlessClusterModel":{"cluster_name":"","node_type_id":"i3.2xlarge","spark_version":"latest-stable-scala2.11","num_workers":null,"enable_jdbc_auto_start":true,"custom_tags":{"ResourceClass":"Serverless"},"autoscale":{"min_workers":2,"max_workers":20},"spark_conf":{"spark.databricks.cluster.profile":"serverless","spark.databricks.repl.allowedLanguages":"sql,python,r","spark.databricks.acl.enabled":"false","spark.databricks.acl.sqlOnly":"false"},"aws_attributes":{"ebs_volume_count":null,"availability":"ON_DEMAND","first_on_demand":1,"ebs_volume_type":null,"spot_bid_price_percent":100,"zone_id":"us-west-2c","ebs_volume_size":null},"autotermination_minutes":0,"enable_elastic_disk":false,"default_tags":{"Vendor":"Databricks","Creator":"benny.avelin@math.uu.se","ClusterName":null,"ClusterId":"<Generated after creation>"}},"enableOrgSwitcherUI":true,"bitbucketCloudBaseApiV2Url":"https://api.bitbucket.org/2.0","clustersLimit":1,"enableJdbcImport":true,"enableClusterAppsUIOnNormalClusters":false,"enableElasticDisk":false,"logfiles":"logfiles/","enableRelativeNotebookLinks":true,"enableMultiSelect":true,"homePageLogo":"login/databricks_logoTM_rgb_TM.svg","enableWebappSharding":true,"enableNotebookParamsEdit":true,"enableClusterDeltaUpdates":true,"enableSingleSignOnLogin":false,"separateTableForJobClusters":true,"ebsVolumeSizeLimitGB":{"GENERAL_PURPOSE_SSD":[100,4096],"THROUGHPUT_OPTIMIZED_HDD":[500,4096]},"enableMountAcls":false,"requireEmailUserName":true,"enableRServerless":true,"dbcFeedbackURL":"mailto:feedback@databricks.com","enableMountAclService":true,"showVersion":true,"serverlessClustersByDefault":false,"enableWorkspaceAcls":false,"maxClusterTagKeyLength":127,"gitHash":"","clusterTagReservedPrefixes":[],"tableAclsEnabledMap":{"spark.databricks.acl.dfAclsEnabled":"true"},"showWorkspaceFeaturedLinks":true,"signupUrl":"https://databricks.com/try-databricks","databricksDocsNotebookPathPrefix":"^https://docs\\.databricks\\.com/_static/notebooks/.+$","serverlessAttachEbsVolumesByDefault":false,"enableTokensConfig":false,"allowFeedbackForumAccess":true,"enablePythonVersionUI":true,"enableImportFromUrl":true,"allowDisplayHtmlByUrl":true,"enableTokens":false,"enableMiniClusters":true,"enableNewJobList":true,"enableDebugUI":false,"enableStreamingMetricsDashboard":true,"allowNonAdminUsers":true,"enableSingleSignOnByTier":false,"enableJobsRetryOnTimeout":true,"loginLogo":"/login/databricks_logoTM_rgb_TM.svg","useStandardTierUpgradeTooltips":true,"staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/","enableSpotClusterType":true,"enableSparkPackages":true,"checkAadUserInWorkspaceTenant":false,"dynamicSparkVersions":true,"useIframeForHtmlResult":false,"enableClusterTagsUIByTier":false,"enableNotebookHistoryUI":true,"addWhitespaceAfterLastNotebookCell":true,"enableClusterLoggingUI":true,"enableDatabaseDropdownInTableUI":true,"showDebugCounters":false,"enableInstanceProfilesUI":false,"enableFolderHtmlExport":true,"homepageFeaturedLinks":[{"linkURI":"https://docs.databricks.com/_static/notebooks/gentle-introduction-to-apache-spark.html","displayName":"Introduction to Apache Spark on Databricks","icon":"img/home/Python_icon.svg"},{"linkURI":"https://docs.databricks.com/_static/notebooks/databricks-for-data-scientists.html","displayName":"Databricks for Data Scientists","icon":"img/home/Scala_icon.svg"},{"linkURI":"https://docs.databricks.com/_static/notebooks/structured-streaming-python.html","displayName":"Introduction to Structured Streaming","icon":"img/home/Python_icon.svg"}],"enableClusterStart":false,"maxImportFileVersion":5,"enableEBSVolumesUIByTier":false,"enableTableAclService":true,"removeSubCommandCodeWhenExport":true,"upgradeURL":"https://accounts.cloud.databricks.com/registration.html#login","maxAutoterminationMinutes":10000,"showResultsFromExternalSearchEngine":true,"autoterminateClustersByDefault":true,"notebookLoadingBackground":"#fff","sshContainerForwardedPort":2200,"enableServerAutoComplete":true,"enableStaticHtmlImport":true,"enableInstanceProfilesByTier":false,"showForgotPasswordLink":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"minAutoterminationMinutes":10,"accounts":true,"useOnDemandClustersByDefault":true,"useFramedStaticNotebooks":false,"enableNewProgressReportUI":true,"enableAutoCreateUserUI":true,"defaultCoresPerContainer":4,"showTerminationReason":true,"enableNewClustersGet":true,"showPricePerDBU":false,"showSqlProxyUI":true,"enableNotebookErrorHighlighting":true};</script>
<script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":3730491142721527,"name":"SparkImplementation","language":"scala","commands":[{"version":"CommandV1","origId":1765094099584240,"guid":"fc175711-dbda-4587-acee-f91da50ac08e","subtype":"command","commandType":"auto","position":0.43768310546875,"command":"/**************************************************************************\n* Copyright 2018 Benny Avelin\n*\n* Licensed under the Apache License, Version 2.0 (the \"License\");\n* you may not use this file except in compliance with the License.\n* You may obtain a copy of the License at\n*\n* http://www.apache.org/licenses/LICENSE-2.0\n*\n* Unless required by applicable law or agreed to in writing, software\n* distributed under the License is distributed on an \"AS IS\" BASIS,\n* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n* See the License for the specific language governing permissions and\n* limitations under the License.\n**************************************************************************/","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1520698508559,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"0d6cf790-d51a-4604-b40c-6828bf921318"},{"version":"CommandV1","origId":2071800560869153,"guid":"991febe3-8db7-4143-a16b-eb082cd91890","subtype":"command","commandType":"auto","position":0.8753662109375,"command":"%md # Description of the ML framework\n<img src=\"https://drive.google.com/a/combient.com/uc?id=1y8VSMDOR-nWi-_buut9Ti0dTHen8uG_f&export=download\" align=\"middle\"></img>\n\nThe above diagram describes the process which data is processed using the streaming machine learning framework. We need to be careful with `SStreamingMLAggregator` since this does the aggregation in a way that is natural when you hear it.\n\n<img src=\"https://drive.google.com/a/combient.com/uc?id=1NXuqMa4xjF4ASXJzGpgBvyiEVlz6RB4P&export=download\"></img>\n\nWhenever a new batch gets processed, it arrives in the input stream and gets split into each `SStreamingMLNode`, each Node does `updateAcrossBatch` and sends an updated output which becomes `Batch1.x`. The first batch gets aggregated into a running state and each new batch arrives, `Batch2` in the above, it gets processed and aggregated ontop of the running state.\n\nThis implies that if we wish to train a model in a parallel way, i.e. we have one model per `SStreamingMLNode`, we need to be a bit more clever when we do the final aggregation since we always have a running state when the new batch arrives. I.e. as it is solved in `TestSStreamingMLxx`, we have the running state as a `Map`, keeping track of each `TestSStreamingMLNode`s output and updates that nodes state in the map. ","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"0c19ae41-d1bc-41d6-a0b8-2334a1e39a38"},{"version":"CommandV1","origId":2071800560869154,"guid":"068e38fd-1abc-4358-9249-ced921c3a95c","subtype":"command","commandType":"auto","position":0.87554931640625,"command":"%md # Updating the model\n* FunctionRegister holds the update functions for use when creating the sink, i.e. we first need to register our specific update function with a unique key value that is later passed as an option when creating the sink.\n\n<img src=\"https://drive.google.com/a/combient.com/uc?id=1LC7GOFAWe4tsyr_L-VKKZ4fSQ-rIiV5Q&export=download\"></img>\n\nWhen we create a `SStreamingML` we need to make sure that in the constructor that the particular update function i.e. `this.update` is registered in the `FunctionRegister`, a template for how this is done can be found in TestSStreamingML. Next whenever we call `fit` on a `SStreamingML` we are creating a `DataStreamWriter[SStreamingML]`, this needs a `Sink` and the way this works is that a `SinkProvider` is called and we can only pass strings as options to this. Thus we have used the functionregister to register the update function that later can be pulled by the SinkProvider based on a unique string sent using an option. Exactly how this works can be seen in the diagram and looking through the example code regarding `TestSStreamingxx`","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"8695821e-cbef-4891-a92e-463ad569d1ba"},{"version":"CommandV1","origId":1594016828504851,"guid":"965487db-1d61-46ac-ba2d-24f2edde0bbf","subtype":"command","commandType":"auto","position":0.87646484375,"command":"%md # Structured streaming machine learning framework\nWe need the following components:\n* A model to store the current state of the model\n* A grouping operation, which essentially becomes the partitioning of the data\n* An aggregation over the first grouping of the data, this is an aggregation that is run on the node\n* An aggregation over the output from the first aggregation.\n\nWe can think of this process as splitting the dataset into streams, process each stream individually and return an aggregated output from each stream. Lastly aggregate on the aggregated output to produce a join of the models over all the streams to a single output. ","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"6f8c1cd9-6ede-40e3-979b-fd708a237b5f"},{"version":"CommandV1","origId":2071800560869160,"guid":"4139536a-9609-4b32-a853-6eb72d0e9071","subtype":"command","commandType":"auto","position":0.8768310546875,"command":"import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, GroupState, StreamingQuery}\nimport org.apache.spark.sql.expressions.Aggregator\nimport org.apache.spark.sql.{Encoder, Encoders, DataFrame, Dataset}\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.ml.linalg.Vector\n\nimport scala.reflect.ClassTag\n\nimport scala.util.Random\n\nabstract class SStreamingMLModel extends Serializable {\n def addBatch(batch: Iterator[Row]): SStreamingMLModel\n def mergeModels(right : SStreamingMLModel):SStreamingMLModel\n def toOutput : SStreamingMLModel\n}\n\nabstract class SStreamingMLNode[T](var features:String = \"features\") extends Serializable{\n \n def setFeaturesCol(features:String) = {\n this.features = features\n this\n }\n \n def updateAcrossBatch(index:Int, inputs: Iterator[Row], oldState: GroupState[T]):Iterator[T]\n}\n\nabstract class SStreamingMLAggregator[T <: SStreamingMLModel] extends Aggregator[T,T,T] {\n // Combine two values to produce a new value. For performance, the function may modify `buffer`\n // and return it instead of constructing a new object\n def reduce(b1: T, b2: T): T = {\n //All scala knows at compilation is that b1.mergeModels will return SStreamingMLModel and we thus\n //need to cast it to type T, which is the actual type of the object. This is the price to pay for\n //dealing with languages that have to know the type at compilation time\n b1.mergeModels(b2).asInstanceOf[T]\n }\n // Merge two intermediate values\n def merge(b1: T, b2: T): T = {\n b1.mergeModels(b2).asInstanceOf[T]\n }\n // Transform the output of the reduction\n def finish(reduction: T): T = {\n reduction.toOutput.asInstanceOf[T]\n }\n}\n\nabstract class SStreamingML(var features:String = \"features\") extends Serializable{\n def setFeaturesCol(features:String) = {\n this.features = features\n this\n }\n def fit(df: DataFrame,checkPointLocation:String) : StreamingQuery\n def transform(df: DataFrame): Dataset[_]\n}","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"12cec517-9247-404d-b331-059505b9dd62"},{"version":"CommandV1","origId":2071800560869157,"guid":"4d8f5fd8-0c88-46cf-a130-de525e3f7474","subtype":"command","commandType":"auto","position":0.877197265625,"command":"package customsink\n\nimport org.apache.spark.internal.Logging\nimport org.apache.spark.SparkException\nimport org.apache.spark.sql.{DataFrame,Dataset, SaveMode, SQLContext}\nimport org.apache.spark.sql.execution.SQLExecution\nimport org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, StreamSinkProvider}\nimport org.apache.spark.sql.streaming.OutputMode\nimport org.apache.spark.sql.types.StructType\nimport org.apache.spark.sql.execution.streaming.Sink\n\n\nobject FunctionRegister {\n private val nullMap : Map[String,(DataFrame) => Unit] = Map[String,(DataFrame) => Unit]((\"null\",(_:DataFrame) => {\n throw new SparkException(\"MLUpdateSink: Null update function called!\")\n }))\n var updateFunctions : Map[String,(DataFrame) => Unit] = nullMap\n \n def register(uniqueID:String, updateFunc : (DataFrame) => Unit) = {\n updateFunctions = updateFunctions ++ Map((uniqueID,updateFunc))\n }\n def remove(uniqueID:String) : Boolean = {\n val exists = updateFunctions.get(uniqueID) match {\n case Some(_) => true\n case None => false\n }\n updateFunctions = updateFunctions-uniqueID\n exists\n }\n def clear() = {\n updateFunctions = nullMap\n }\n}\n\nclass MLUpdateSink(options: Map[String, String]) extends Sink with Logging {\n // Track the batch id\n private var lastBatchId = -1L\n \n private val update : (DataFrame) => Unit = FunctionRegister\n .updateFunctions\n .getOrElse( options.getOrElse(\"updateFunction\",\"null\")\n ,FunctionRegister.updateFunctions(\"null\"))\n\n override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized {\n //Do something\n update(data)\n }\n\n override def toString(): String = s\"AlertSink\"\n}\n\n\nclass MLUpdateSinkProvider extends StreamSinkProvider\n{\n def createSink(\n sqlContext: SQLContext,\n parameters: Map[String, String],\n partitionColumns: Seq[String],\n outputMode: OutputMode): Sink = {\n new MLUpdateSink(parameters)\n }\n def shortName(): String = \"alert\"\n}","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"6927ce68-c3f4-4646-a9ec-687a7ea55eac"},{"version":"CommandV1","origId":1594016828504850,"guid":"2c9fa53c-8bbf-47b8-a46d-0c752a16960b","subtype":"command","commandType":"auto","position":0.8779296875,"command":"import scala.collection.mutable.WrappedArray\n\nobject TestSStreamingMLAggregator extends SStreamingMLAggregator[TestSStreamingMLModel] {\n // A zero value for this aggregation. Should satisfy the property that any b + zero = b\n def zero: TestSStreamingMLModel = new TestSStreamingMLModel()\n // Specifies the Encoder for the intermediate value type\n def bufferEncoder: Encoder[TestSStreamingMLModel] = Encoders.product\n // Specifies the Encoder for the final output value type\n def outputEncoder: Encoder[TestSStreamingMLModel] = Encoders.product\n}\n\ncase class TestSStreamingMLModel( val states: Map[Int,Array[Double]] = Map[Int,Array[Double]]()\n ,features:String = \"features\") extends SStreamingMLModel {\n \n def addBatch(batch: Iterator[Row]) : TestSStreamingMLModel = {\n this\n }\n \n def addBatch(index:Int, batch: Iterator[Row]) : TestSStreamingMLModel = {\n val it : Iterator[Vector] = batch.map(row => row.getAs[Vector](features))\n addTestBatch(index,it)\n }\n \n def addTestBatch(index:Int, batch : Iterator[Vector]) : TestSStreamingMLModel = {\n //TestSStreamingMLModel(arr++(batch.toArray.flatMap(x => x.toArray)),features)\n TestSStreamingMLModel(Map((index,(batch.toArray.flatMap(x => x.toArray)))),features)\n }\n \n def mergeModels(right: SStreamingMLModel) : TestSStreamingMLModel = {\n val rightKMM = right.asInstanceOf[TestSStreamingMLModel]\n if (rightKMM.states.keys.size == 0) this\n else if (rightKMM.states.keys.size == 1) {\n val rightIndex = rightKMM.states.keys.head\n val newStates = states++Map((rightIndex,rightKMM.states(rightIndex)))\n TestSStreamingMLModel(newStates,features)\n }\n else {\n //Here we have the left and right are actually aggregations themselves\n val newStates = states++rightKMM.states\n TestSStreamingMLModel(newStates,features)\n }\n }\n \n def toOutput : TestSStreamingMLModel = {\n //this\n TestSStreamingMLModel(Map((0,states.valuesIterator.toArray.flatMap(x => x))),features)\n }\n}\n\nclass TestSStreamingMLNode(var inputCol : String = \"features\") extends SStreamingMLNode[TestSStreamingMLModel](inputCol) {\n \n def updateAcrossBatch(index:Int, inputs: Iterator[Row], oldState: GroupState[TestSStreamingMLModel]):Iterator[TestSStreamingMLModel] = {\n var state:TestSStreamingMLModel = if (oldState.exists) oldState.get else new TestSStreamingMLModel(features=this.features)\n \n state = state.addBatch(index,inputs)\n oldState.update(state)\n //Return needs to be an iterator of TestSStreamingMLModel\n Array(state).iterator\n }\n \n}\n\nclass TestSStreamingML(val numPartitions:Int = 10, var inputCol: String = \"features\") extends SStreamingML(inputCol) {\n var model: TestSStreamingMLModel = TestSStreamingMLModel()\n var sstreamingNode = new TestSStreamingMLNode(features)\n this.register()\n \n def fit(df: DataFrame,checkPointLocation:String): StreamingQuery = {\n //Fitting Query\n df.groupByKey(x => Random.nextInt(numPartitions))\n .flatMapGroupsWithState(OutputMode.Append,GroupStateTimeout.NoTimeout)(sstreamingNode.updateAcrossBatch)\n .select(TestSStreamingMLAggregator.toColumn)\n .writeStream\n .outputMode(\"update\")\n .format(\"customsink.MLUpdateSinkProvider\")\n .option(\"checkpointLocation\",checkPointLocation)\n .option(\"updateFunction\",this.toString).start()\n }\n def transform(df : DataFrame): Dataset[_] = {\n df\n }\n def update(ds : DataFrame) = {\n val collectedDataSet = ds.sparkSession.createDataFrame(\n ds.sparkSession.sparkContext.parallelize(ds.collect()), ds.schema).as[TestSStreamingMLModel]\n this.model = collectedDataSet.collect().head\n }\n def register() = {\n customsink.FunctionRegister.register(this.toString,this.update)\n }\n}","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import scala.collection.mutable.WrappedArray\ndefined object TestSStreamingMLAggregator\ndefined class TestSStreamingMLModel\ndefined class TestSStreamingMLNode\ndefined class TestSStreamingML\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[],"datasetInfos":[]},"errorSummary":"<div class=\"ansiout\">notebook:73: error: type mismatch;\n found : Array[Double]\n required: Map[Int,Array[Double]]\n case Row(a:WrappedArray[_],b:String) => TestSStreamingMLModel(a.map(point => point.asInstanceOf[Double]).array,b)\n ^\nnotebook:76: error: value arr is not a member of TestSStreamingMLModel\n println(model.arr.length)\n ^\n</div>","error":null,"workflows":[],"startTime":1518610624949,"submitTime":1518610624931,"finishTime":1518610626069,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"a038a943-cddc-49c9-8574-6d6f94a272ef"},{"version":"CommandV1","origId":2915234140761113,"guid":"5adf93db-dee5-431a-a169-2efd746eb39e","subtype":"command","commandType":"auto","position":0.881591796875,"command":"import org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.feature.VectorAssembler\n\ncustomsink.FunctionRegister.clear // This is used for testing and not production, since we have the same running instance and we wish to avoid registering twice\n\nval rates = spark.\n readStream.\n format(\"rate\"). // <-- use RateStreamSource\n option(\"rowsPerSecond\", 1).\n load\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"value\"))\n .setOutputCol(\"features\")\nval pipeline = new Pipeline()\n .setStages(Array(assembler))\nval pipelineModel = pipeline.fit(rates)\nval transformedDF = pipelineModel.transform(rates)\n\nval testStream = new TestSStreamingML(inputCol=\"features\")\nval query = testStream.fit(transformedDF,\"/FileStore/checkpoint\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.feature.VectorAssembler\nrates: org.apache.spark.sql.DataFrame = [timestamp: timestamp, value: bigint]\nassembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_4cad977ee3bb\npipeline: org.apache.spark.ml.Pipeline = pipeline_a8b7b887ec89\npipelineModel: org.apache.spark.ml.PipelineModel = pipeline_a8b7b887ec89\ntransformedDF: org.apache.spark.sql.DataFrame = [timestamp: timestamp, value: bigint ... 1 more field]\ntestStream: TestSStreamingML = TestSStreamingML@51a5d7a\nquery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@2470e18\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[],"datasetInfos":[{"name":"rates","typeStr":"org.apache.spark.sql.DataFrame","schema":{"type":"struct","fields":[{"name":"timestamp","type":"timestamp","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}}]},"tableIdentifier":null},{"name":"transformedDF","typeStr":"org.apache.spark.sql.DataFrame","schema":{"type":"struct","fields":[{"name":"timestamp","type":"timestamp","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"value"}]},"num_attrs":1}}}]},"tableIdentifier":null}]},"errorSummary":"java.rmi.RemoteException: com.databricks.api.base.DatabricksServiceException: QUOTA_EXCEEDED: You have exceeded the maximum number of allowed files on Databricks Community Edition. To ensure free access, you are limited to 10000 files and 10 GB of storage in DBFS. Please use dbutils.fs to list and clean up files to restore service. You may have to wait a few minutes after cleaning up the files for the quota to be refreshed. (Files found: 11709); nested exception is: ","error":"<div class=\"ansiout\">\tcom.databricks.api.base.DatabricksServiceException: QUOTA_EXCEEDED: You have exceeded the maximum number of allowed files on Databricks Community Edition. To ensure free access, you are limited to 10000 files and 10 GB of storage in DBFS. Please use dbutils.fs to list and clean up files to restore service. You may have to wait a few minutes after cleaning up the files for the quota to be refreshed. (Files found: 11709)\n\tat com.databricks.backend.daemon.data.client.DbfsClient.send0(DbfsClient.scala:100)\n\tat com.databricks.backend.daemon.data.client.DbfsClient.sendIgnoreDraining(DbfsClient.scala:69)\n\tat com.databricks.backend.daemon.data.client.DbfsOutputStream.close0(DbfsOutputStream.scala:99)\n\tat com.databricks.backend.daemon.data.client.DbfsOutputStream.close(DbfsOutputStream.scala:78)\n\tat java.io.FilterOutputStream.close(FilterOutputStream.java:159)\n\tat org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)\n\tat org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)\n\tat sun.nio.cs.StreamEncoder.implClose(StreamEncoder.java:320)\n\tat sun.nio.cs.StreamEncoder.close(StreamEncoder.java:149)\n\tat java.io.OutputStreamWriter.close(OutputStreamWriter.java:233)\n\tat com.fasterxml.jackson.core.json.WriterBasedJsonGenerator.close(WriterBasedJsonGenerator.java:883)\n\tat com.fasterxml.jackson.databind.ObjectMapper._configAndWriteValue(ObjectMapper.java:3561)\n\tat com.fasterxml.jackson.databind.ObjectMapper.writeValue(ObjectMapper.java:2909)\n\tat org.json4s.jackson.Serialization$.write(Serialization.scala:27)\n\tat org.apache.spark.sql.execution.streaming.StreamMetadata$.write(StreamMetadata.scala:78)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$5.apply(StreamExecution.scala:120)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$5.apply(StreamExecution.scala:118)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.sql.execution.streaming.StreamExecution.<init>(StreamExecution.scala:118)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.createQuery(StreamingQueryManager.scala:250)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:288)\n\tat org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:277)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761113:27)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761113:107)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761113:109)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761113:111)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw$$iw.<init>(command-2915234140761113:113)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw$$iw.<init>(command-2915234140761113:115)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw$$iw.<init>(command-2915234140761113:117)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$$iw.<init>(command-2915234140761113:119)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read.<init>(command-2915234140761113:121)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$.<init>(command-2915234140761113:125)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$read$.<clinit>(command-2915234140761113)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$eval$.$print$lzycompute(<notebook>:7)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$eval$.$print(<notebook>:6)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e674.$eval.$print(<notebook>)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)\n\tat scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)\n\tat scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)\n\tat scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)\n\tat scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)\n\tat scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)\n\tat com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:186)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply$mcV$sp(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:457)\n\tat com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:411)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:235)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:216)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:40)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:221)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:40)\n\tat com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:216)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)\n\tat scala.util.Try$.apply(Try.scala:192)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:596)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:554)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:348)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:215)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: com.databricks.api.base.DatabricksServiceException: QUOTA_EXCEEDED: You have exceeded the maximum number of allowed files on Databricks Community Edition. To ensure free access, you are limited to 10000 files and 10 GB of storage in DBFS. Please use dbutils.fs to list and clean up files to restore service. You may have to wait a few minutes after cleaning up the files for the quota to be refreshed. (Files found: 11709)\n\tat com.databricks.api.rpc.ScalaProtoRpcSerializer.deserializeException(ScalaProtoRpcSerializer.scala:156)\n\tat com.databricks.api.rpc.ScalaProtoRpcSerializer.deserializeException(ScalaProtoRpcSerializer.scala:149)\n\tat com.databricks.rpc.JettyClient$$anonfun$6.apply(JettyClient.scala:320)\n\tat com.databricks.rpc.JettyClient$$anonfun$6.apply(JettyClient.scala:318)\n\tat com.databricks.util.UntrustedUtils$.logUncaughtExceptions(UntrustedUtils.scala:34)\n\tat com.databricks.rpc.JettyClient.getResponseFromExchange(JettyClient.scala:318)\n\tat com.databricks.rpc.JettyClient.com$databricks$rpc$JettyClient$$processResponse0(JettyClient.scala:353)\n\tat com.databricks.rpc.JettyClient$$anonfun$com$databricks$rpc$JettyClient$$processResponse$1.apply(JettyClient.scala:342)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)\n\tat com.databricks.rpc.JettyClient.withAttributionContext(JettyClient.scala:31)\n\tat com.databricks.rpc.JettyClient.com$databricks$rpc$JettyClient$$processResponse(JettyClient.scala:340)\n\tat com.databricks.rpc.JettyClient$$anonfun$5.apply(JettyClient.scala:58)\n\tat scala.util.Try$.apply(Try.scala:161)\n\tat com.databricks.rpc.JettyClient.send(JettyClient.scala:58)\n\tat com.databricks.rpc.BoundRPCClient.send(BoundRPCClient.scala:36)\n\tat com.databricks.rpc.ReliableJettyClient$$anonfun$sendNonIdempotent$1.apply(ReliableJettyClient.scala:81)\n\tat com.databricks.rpc.ReliableJettyClient.retryOnNetworkError(ReliableJettyClient.scala:152)\n\tat com.databricks.rpc.ReliableJettyClient.sendNonIdempotent(ReliableJettyClient.scala:80)\n\tat com.databricks.backend.daemon.data.server.DbfsLimitEnforcer.allocate(DbfsLimitEnforcer.scala:22)\n\tat com.databricks.s3a.enforcer.S3AEnforcer$.allocateWithOverwriteCheck(S3AEnforcer.scala:43)\n\tat com.databricks.s3a.aws.transfer.EnforcingDatabricksTransferManager.upload(EnforcingDatabricksTransferManager.scala:35)\n\tat com.databricks.s3a.S3AOutputStream.close(S3AOutputStream.java:139)\n\tat org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72)\n\tat org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:106)\n\tat java.io.FilterOutputStream.close(FilterOutputStream.java:159)\n\tat com.databricks.backend.daemon.data.server.session.IOStreamManager$OutStream.close(IOStreamManager.scala:155)\n\tat com.databricks.backend.daemon.data.server.handler.IOStreamHandler.receive(IOStreamHandler.scala:25)\n\tat com.databricks.backend.daemon.data.server.session.SessionContext$$anonfun$queryHandlers$1.apply(SessionContext.scala:71)\n\tat com.databricks.backend.daemon.data.server.session.SessionContext$$anonfun$queryHandlers$1.apply(SessionContext.scala:70)\n\tat scala.collection.immutable.List.foreach(List.scala:318)\n\tat com.databricks.backend.daemon.data.server.session.SessionContext.queryHandlers(SessionContext.scala:70)\n\tat com.databricks.backend.daemon.data.server.DbfsServerBackend$$anonfun$receive$1.applyOrElse(DbfsServerBackend.scala:272)\n\tat com.databricks.backend.daemon.data.server.DbfsServerBackend$$anonfun$receive$1.applyOrElse(DbfsServerBackend.scala:252)\n\tat com.databricks.rpc.ServerBackend$$anonfun$internalReceive$1$$anonfun$apply$1.apply(ServerBackend.scala:42)\n\tat com.databricks.rpc.ServerBackend$$anonfun$internalReceive$1$$anonfun$apply$1.apply(ServerBackend.scala:38)\n\tat com.databricks.rpc.ServerBackend$$anonfun$com$databricks$rpc$ServerBackend$$commonReceive$1.applyOrElse(ServerBackend.scala:58)\n\tat com.databricks.rpc.ServerBackend$$anonfun$com$databricks$rpc$ServerBackend$$commonReceive$1.applyOrElse(ServerBackend.scala:58)\n\tat com.databricks.rpc.ServerBackend$$anonfun$internalReceive$1.apply(ServerBackend.scala:38)\n\tat com.databricks.logging.UsageLogging$$anonfun$recordOperation$1.apply(UsageLogging.scala:313)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)\n\tat com.databricks.rpc.ServerBackend.withAttributionContext(ServerBackend.scala:13)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:221)\n\tat com.databricks.rpc.ServerBackend.withAttributionTags(ServerBackend.scala:13)\n\tat com.databricks.logging.UsageLogging$class.recordOperation(UsageLogging.scala:298)\n\tat com.databricks.rpc.ServerBackend.recordOperation(ServerBackend.scala:13)\n\tat com.databricks.rpc.ServerBackend.internalReceive(ServerBackend.scala:37)\n\tat com.databricks.rpc.JettyServer$RequestManager$$anonfun$10.apply(JettyServer.scala:285)\n\tat scala.util.Try$.apply(Try.scala:161)\n\tat com.databricks.rpc.JettyServer$RequestManager.handleRPC(JettyServer.scala:285)\n\tat com.databricks.rpc.JettyServer$RequestManager.com$databricks$rpc$JettyServer$RequestManager$$handleRequestAndRespond(JettyServer.scala:220)\n\tat com.databricks.rpc.JettyServer$RequestManager$$anonfun$handleHttp$1.apply$mcV$sp(JettyServer.scala:154)\n\tat com.databricks.rpc.JettyServer$RequestManager$$anonfun$handleHttp$1.apply(JettyServer.scala:145)\n\tat com.databricks.rpc.JettyServer$RequestManager$$anonfun$handleHttp$1.apply(JettyServer.scala:145)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)\n\tat com.databricks.rpc.JettyServer$.withAttributionContext(JettyServer.scala:81)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:221)\n\tat com.databricks.rpc.JettyServer$.withAttributionTags(JettyServer.scala:81)\n\tat com.databricks.rpc.JettyServer$RequestManager.handleHttp(JettyServer.scala:144)\n\tat com.databricks.rpc.JettyServer$RequestManager.doGet(JettyServer.scala:99)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:845)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:584)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:514)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:523)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:320)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:251)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:95)\n\tat org.eclipse.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)\n\tat org.eclipse.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)\n\tat java.lang.Thread.run(Thread.java:748)</div>","workflows":[],"startTime":1518610633970,"submitTime":1518610633970,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"Testing the framework with rates as streaming input","showCommandTitle":true,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb":[{"name":"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb","isActive":true,"sources":[],"sink":null,"exception":null,"version":2,"progressJson":null},{"name":"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb","isActive":true,"sources":[],"sink":null,"exception":null,"version":2,"progressJson":"{\"name\":null,\"timestamp\":\"2018-02-14T12:17:43.947Z\",\"id\":\"b15fa34a-52be-4e06-94c3-6bfc75052777\",\"durationMs\":{\"getBatch\":9,\"queryPlanning\":64,\"triggerExecution\":26651,\"getOffset\":0,\"walCommit\":746,\"addBatch\":25785},\"stateOperators\":[{\"numRowsTotal\":1,\"numRowsUpdated\":1,\"memoryUsedBytes\":414},{\"numRowsTotal\":1,\"numRowsUpdated\":1,\"memoryUsedBytes\":12950}],\"processedRowsPerSecond\":0.037522044200968066}"},{"name":"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb","isActive":true,"sources":[],"sink":null,"exception":null,"version":2,"progressJson":"{\"name\":null,\"timestamp\":\"2018-02-14T12:18:11.426Z\",\"inputRowsPerSecond\":0.9825685068597838,\"id\":\"b15fa34a-52be-4e06-94c3-6bfc75052777\",\"durationMs\":{\"getBatch\":9,\"queryPlanning\":37,\"triggerExecution\":13302,\"getOffset\":0,\"walCommit\":929,\"addBatch\":12325},\"stateOperators\":[{\"numRowsTotal\":1,\"numRowsUpdated\":1,\"memoryUsedBytes\":814},{\"numRowsTotal\":10,\"numRowsUpdated\":10,\"memoryUsedBytes\":20254}],\"processedRowsPerSecond\":2.029769959404601}"},{"name":"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb","isActive":true,"sources":[],"sink":null,"exception":null,"version":2,"progressJson":"{\"name\":null,\"timestamp\":\"2018-02-14T12:18:25.629Z\",\"inputRowsPerSecond\":0.9857072449482505,\"id\":\"b15fa34a-52be-4e06-94c3-6bfc75052777\",\"durationMs\":{\"getBatch\":12,\"queryPlanning\":44,\"triggerExecution\":9452,\"getOffset\":0,\"walCommit\":909,\"addBatch\":8483},\"stateOperators\":[{\"numRowsTotal\":1,\"numRowsUpdated\":1,\"memoryUsedBytes\":742},{\"numRowsTotal\":10,\"numRowsUpdated\":9,\"memoryUsedBytes\":20182}],\"processedRowsPerSecond\":1.4811680067710538}"},{"name":"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb","isActive":false,"sources":[{"description":"RateSource[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=8]","latestOffset":"42"}],"sink":{"description":"AlertSink","latestOffset":"[42]"},"exception":null,"version":2,"progressJson":"{\n \"id\" : \"b15fa34a-52be-4e06-94c3-6bfc75052777\",\n \"runId\" : \"ce2d0ee1-0f3b-4a29-9fc6-5f8f6818c5fb\",\n \"name\" : null,\n \"timestamp\" : \"2018-02-14T12:18:25.629Z\",\n \"batchId\" : 2,\n \"numInputRows\" : 14,\n \"inputRowsPerSecond\" : 0.9857072449482505,\n \"processedRowsPerSecond\" : 1.4811680067710538,\n \"durationMs\" : {\n \"addBatch\" : 8483,\n \"getBatch\" : 12,\n \"getOffset\" : 0,\n \"queryPlanning\" : 44,\n \"triggerExecution\" : 9452,\n \"walCommit\" : 909\n },\n \"stateOperators\" : [ {\n \"numRowsTotal\" : 1,\n \"numRowsUpdated\" : 1,\n \"memoryUsedBytes\" : 742\n }, {\n \"numRowsTotal\" : 10,\n \"numRowsUpdated\" : 9,\n \"memoryUsedBytes\" : 20182\n } ],\n \"sources\" : [ {\n \"description\" : \"RateSource[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=8]\",\n \"startOffset\" : 28,\n \"endOffset\" : 42,\n \"numInputRows\" : 14,\n \"inputRowsPerSecond\" : 0.9857072449482505,\n \"processedRowsPerSecond\" : 1.4811680067710538\n } ],\n \"sink\" : {\n \"description\" : \"AlertSink\"\n }\n}"}]},"nuid":"67eb7a59-2b07-4d15-bd07-1521044de70e"},{"version":"CommandV1","origId":2071800560869155,"guid":"c71be690-faa4-4e54-b0e7-08ca09818542","subtype":"command","commandType":"auto","position":0.881622314453125,"command":"testStream.model.states(0)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res17: Array[Double] = Array(13.0, 22.0, 24.0, 10.0, 16.0, 1.0, 26.0, 2.0, 3.0, 6.0, 17.0, 4.0, 14.0, 27.0, 5.0, 7.0, 11.0, 20.0, 12.0, 15.0, 18.0, 8.0, 9.0, 19.0, 21.0, 23.0, 25.0)\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[],"datasetInfos":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1518610708400,"submitTime":1518610708383,"finishTime":1518610708658,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"8ce86a3c-d51c-411f-bdca-ca3f7ce66099"},{"version":"CommandV1","origId":2915234140761122,"guid":"41a8881f-af04-470d-a8f1-b8c2fe1d85c9","subtype":"command","commandType":"auto","position":0.88165283203125,"command":"testStream.model.states(0).length","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">res6: Int = 21\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[],"datasetInfos":[]},"errorSummary":"java.util.NoSuchElementException: key not found: 0","error":"<div class=\"ansiout\">\tat scala.collection.MapLike$class.default(MapLike.scala:228)\n\tat scala.collection.AbstractMap.default(Map.scala:59)\n\tat scala.collection.MapLike$class.apply(MapLike.scala:141)\n\tat scala.collection.AbstractMap.apply(Map.scala:59)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761122:1)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761122:62)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761122:64)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-2915234140761122:66)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw$$iw.<init>(command-2915234140761122:68)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw$$iw.<init>(command-2915234140761122:70)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw$$iw.<init>(command-2915234140761122:72)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$$iw.<init>(command-2915234140761122:74)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read.<init>(command-2915234140761122:76)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$.<init>(command-2915234140761122:80)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$read$.<clinit>(command-2915234140761122)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$eval$.$print$lzycompute(<notebook>:7)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$eval$.$print(<notebook>:6)\n\tat lineea4fbb6de0ed44d980ede6c28590e6e650.$eval.$print(<notebook>)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)\n\tat scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)\n\tat scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)\n\tat scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)\n\tat scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)\n\tat scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)\n\tat scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)\n\tat com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:186)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply$mcV$sp(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal$$anonfun$repl$1.apply(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:457)\n\tat com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:411)\n\tat com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:182)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:235)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$3.apply(DriverLocal.scala:216)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:188)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:183)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:40)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:221)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:40)\n\tat com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:216)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:601)\n\tat scala.util.Try$.apply(Try.scala:192)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:596)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:554)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:348)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:215)\n\tat java.lang.Thread.run(Thread.java:748)</div>","workflows":[],"startTime":1518610068495,"submitTime":1518610068475,"finishTime":1518610068782,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"92feef73-88e2-4d3e-8331-e4e677d3b506"},{"version":"CommandV1","origId":2071800560869159,"guid":"4407cc6d-d108-4770-9c54-c718acc63e7c","subtype":"command","commandType":"auto","position":0.94091796875,"command":"%md # Structured Streaming ML\nWhen trying to adapt a *structured streaming* ML workflow, we run in to the issue of how to store the learned model. Holden Karau realized this and implemented what she calls an `EvilStreamingQueryManager`. This is needed since she solves the storage of a learned model by implementing a CustomSink called `ForEachDatasetSink` and a `ForEachDatasetSinkProvider`. I.e. we can only use the provided sinks unless we provide a *Provider*. We can however avoid using the `EvilStreamingQueryManager` by using the classpath for the provider as `.format(\"package.CustomProviderName\")` on a `DataStreamWriter` object.\n\n## Streaming K-Means\nStreaming K-Means and Online K-Means means different things based on who you ask, therefore we will be specific and state that what we mean is that the information that is kept by the model is \"independent\" of the size of the dataset. Why independent is in quotation marks is because if we implement a way for cluster counts to grow with the size of the data, we only keep some log factor of the datasets size. Ignoring this technical issue for now, we will work under the assumption that the data kept is independent of datasize.\n\n### Online K-Means\nThe simplest form of an online K-Means with no way of forgetting, changing size and so on. Begin by fixing the number of clusters `k1`\n\n```python\nState definition:\nKMeansState: a set of points with weights\n```\n\n```python\nOnline K-Means algorithm:\ninput: a point p with weight 1, and a KMeansState state of length <=k1\noutput: clusterpoints with weights of size <= k1\nif size(state) < k1\n state.append((p,1))\n return state\nelse\n point,weight = find_closest(state,p,1)\n state(point).update(average(point,weight,p,1),weight+1)\n return state\n```\n\n#### Forgetfulness in online K-Means\nHow do we handle forgetfulness? One way of dealing with this is to incorporate a decay factor `a` in the following way\n```python\nOnline K-Means algorithm with forgetfulness:\ninput: a point p with weight 1, and a KMeansState state of length <=k1\noutput: clusterpoints with weights of size <= k1\nif size(state) < k1\n state.append((p,1))\n return state\nelse\n point,weight = find_closest(state,p,1)\n state(point).update(average(point,weight*a,p,1),weight*a+1)\n return state\n```\nanother way which is related to the above is to work in mini-batch mode, i.e. instead of adding a point to the online algorithm we add a mini-batch. The only difference from the above is that we perform labeling of each point in the mini-batch and then perform averages over the labels before changing the value of the cluster points.\n\n### Parallelizing the algorithm\nTo parallelize a streaming k-Means, we will split the stream into a predefined number of groups by using a *salted index* type approach, i.e. we group the data with labels drawn from a uniform distribution. In average this means that the streams will be of similar size/rate. How to decide how many streams one needs depends on the computational resources as well as the rate for which data arrives. I.e. we create parallel streams with `k1` cluster points and use shortest pairwise distance to join the clusters. The method proposed below is not commutative which doesnt really make that much sense. **TODO: Fix this**\n\n```python\nMerge of two states:\ninput: two KMeansStates (L1, L2) each of length <= k1\noutput: a KMeansState of length <= k1\noutState = []\nfor point,weight in L1:\n L2point, L2weight = find_closest(L2,point,weight)\n outState.append((average(L2point,L2weight,point, weight),weight+L2weight))\nreturn outState\n```\n\n### Second method to aggregate\nOne way to parallelize the stream is to do as above but changing the way to aggregate. \n\n```python\nAppend merge:\ninput: two KMeansState(L1,L2)\noutput: a KMeansState\nreturn L1++L2\n```\n\nThis results in a \"large\" `KMeansState` containing all the proposed centers from the parallel processes that each runs online K-Means as described above. The way we process this large state variable is to run K-Means++ on it in the following way.\n\n```python\nKMeans++ on states:\ninput: a KMeansState state of size >= k1 and a cluster counts k2 <= k1\noutput: a KMeansState of size k2\n\nreset all the weights in state to be 1 # Does this make sense?\ninit_point = state.sample()\nproposed_centers = [(init_point,1)]\nwhile (size(proposed_centers) < k2)\n distances = squared_distance(proposed_centers,state) #Compute the distance from each point in state to the set of centers\n next_sample = state.sample(pdf = distances/distances.sum)\n proposed_centers.append((next_sample,1))\n proposed_centers.distinct()\n\noutState = state\n .map((point,weight) => (find_closest(proposed_centers,point,weight).point,(point,weight)))\n .groupByKey()\n .count()\nreturn outState\n```","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"6bf7309a-4ff9-4d22-a725-f54c02cfb2f8"}],"dashboards":[],"guid":"e69d10a9-b052-4471-9aa0-59cac640178f","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script>
<script
src="https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/js/metrics-graphics.js"
onerror="window.mainJsLoadError = true;"></script>
<script
src="https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/js/notebook-main.js"
onerror="window.mainJsLoadError = true;"></script>
</head>
<body>
<script>
if (window.mainJsLoadError) {
var u = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/7f9cefa92f0da43a505f7213ef5a6bb5d4a409ec4e0540a0a55701946063455d/js/notebook-main.js';
var b = document.getElementsByTagName('body')[0];
var c = document.createElement('div');
c.innerHTML = ('<h1>Network Error</h1>' +
'<p><b>Please check your network connection and try again.</b></p>' +
'<p>Could not load a required resource: ' + u + '</p>');
c.style.margin = '30px';
c.style.padding = '20px 50px';
c.style.backgroundColor = '#f5f5f5';
c.style.borderRadius = '5px';
b.appendChild(c);
}
</script>
</body>
</html>