From 5f3348296aee6b1e891345e1037036984209789a Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Wed, 16 Jul 2014 06:24:32 -0700 Subject: [PATCH 01/22] update pom.xml for hadoop-2.3-cdh50.0 and hbase 0.96.1.1 --- pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..23d724dad7722 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,6 +1069,16 @@ + + hadoop-2.3-cdh5.0.0 + + 2.3.0-cdh5.0.0 + 2.5.0 + 0.9.0 + 0.96.1.1-cdh5.0.0 + + + hadoop-2.4 From 8ddbcce084df8f3310ec4cbac4c16a3bc864d3b4 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Wed, 23 Jul 2014 09:09:26 -0700 Subject: [PATCH 02/22] Mesos workaround --- .../org/apache/spark/executor/MesosExecutorBackend.scala | 2 +- .../spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index a42c8b43bbf7f..8c44c1696c833 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) - .setData(ByteString.copyFrom(data)) +// .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index c717e7c621a8f..8afe2ecfec40a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) - .setData(ByteString.copyFrom(createExecArg())) +// .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) - .setData(ByteString.copyFrom(task.serializedTask)) +// .setData(ByteString.copyFrom(task.serializedTask)) .build() } From 7ea3391911a472f08121b95a0e19508c43dd3638 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Tue, 22 Jul 2014 21:33:25 -0700 Subject: [PATCH 03/22] SPARK-2638 MapOutputTracker concurrency improvement Rolled back files not intended for checkin --- .../org/apache/spark/MapOutputTracker.scala | 2 +- .../spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 +- .../apache/spark/MapOutputTrackerSuite.scala | 67 +++++++++++++++++-- pom.xml | 10 --- 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 894091761485d..560782079ff31 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -130,7 +130,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - fetching.synchronized { + shuffleId.toString.intern.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index 8c44c1696c833..a42c8b43bbf7f 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) -// .setData(ByteString.copyFrom(data)) + .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 8afe2ecfec40a..c717e7c621a8f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) -// .setData(ByteString.copyFrom(createExecArg())) + .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) -// .setData(ByteString.copyFrom(task.serializedTask)) + .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index 9702838085627..af3e2ecb80589 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -17,6 +17,11 @@ package org.apache.spark +import java.util.Date +import java.util.concurrent._ + +import org.apache.log4j.Logger + import scala.concurrent.Await import akka.actor._ @@ -29,6 +34,7 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AkkaUtils class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { + val logger = Logger.getLogger(getClass.getName) private val conf = new SparkConf test("compressSize") { assert(MapOutputTracker.compressSize(0L) === 0) @@ -137,29 +143,76 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { val timeout = AkkaUtils.lookupTimeout(conf) slaveTracker.trackerActor = Await.result(selection.resolveOne(timeout), timeout) - masterTracker.registerShuffle(10, 1) + // Test single shuffle execution + val shuffleId = 10 + val start = new Date().getTime + invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) + val singleFetchDuration = new Date().getTime - start + + // Test Parallel execution of shuffles + val NShuffles = 20 + import collection.mutable.ArrayBuffer + val threads = new ArrayBuffer[Thread](NShuffles) + val barrier = new CyclicBarrier(NShuffles) + val latch = new CountDownLatch(NShuffles) + + class ShuffleTesterThread(threadNum: Int, barrier: CyclicBarrier, latch : CountDownLatch) + extends Thread(s"ShuffleTester$threadNum") { + override def run() = { + val shuffleId = 20+threadNum + barrier.await + + invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) + latch.countDown() + } + } + + for (shuffle <- 1 to NShuffles) { + val t = new ShuffleTesterThread(shuffle, barrier, latch) + threads += t + t.start() + } + val pstart = new Date().getTime + latch.await(10, TimeUnit.SECONDS) + assert(latch.getCount == 0, "Not all shuffles completed within allowed time period") + val parallelFetchDuration = new Date().getTime - pstart + log(s"All $NShuffles fetches Completed: duration=$parallelFetchDuration " + + s"(vs single fetch=$singleFetchDuration)") + assert (parallelFetchDuration > singleFetchDuration + && parallelFetchDuration <= (NShuffles * 0.4) * singleFetchDuration, + "Parallel remote fetch should show strong sub-linear execution time increase " + + s" vs number of remote Fetches. " + + s"Actual: single fetch=$singleFetchDuration $NShuffles-threads=$parallelFetchDuration") + + } + + def invokeRemoteFetch(masterTracker: MapOutputTrackerMaster, + slaveTracker: MapOutputTrackerWorker, shuffleId: Int): Unit = { + masterTracker.registerShuffle(shuffleId, 1) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } val compressedSize1000 = MapOutputTracker.compressSize(1000L) val size1000 = MapOutputTracker.decompressSize(compressedSize1000) - masterTracker.registerMapOutput(10, 0, new MapStatus( + masterTracker.registerMapOutput(shuffleId, 0, new MapStatus( BlockManagerId("a", "hostA", 1000, 0), Array(compressedSize1000))) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - assert(slaveTracker.getServerStatuses(10, 0).toSeq === + assert(slaveTracker.getServerStatuses(shuffleId, 0).toSeq === Seq((BlockManagerId("a", "hostA", 1000, 0), size1000))) - masterTracker.unregisterMapOutput(10, 0, BlockManagerId("a", "hostA", 1000, 0)) + masterTracker.unregisterMapOutput(shuffleId, 0, BlockManagerId("a", "hostA", 1000, 0)) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } // failure should be cached - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } } + def log(msg: String) = logger.info(msg) + test("remote fetch below akka frame size") { val newConf = new SparkConf newConf.set("spark.akka.frameSize", "1") diff --git a/pom.xml b/pom.xml index 23d724dad7722..ae97bf03c53a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,16 +1069,6 @@ - - hadoop-2.3-cdh5.0.0 - - 2.3.0-cdh5.0.0 - 2.5.0 - 0.9.0 - 0.96.1.1-cdh5.0.0 - - - hadoop-2.4 From f780ad12db493634a8a14aa71de93d082db01a49 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 25 Jul 2014 02:15:39 -0700 Subject: [PATCH 04/22] Updated concurrency fix for using same monitor on the synchronized and wait logic --- .../org/apache/spark/MapOutputTracker.scala | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 560782079ff31..681d75d48e5ba 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.util.concurrent.ConcurrentSkipListSet import java.util.zip.{GZIPInputStream, GZIPOutputStream} import scala.collection.mutable.{HashSet, HashMap, Map} @@ -95,7 +96,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging protected val epochLock = new AnyRef /** Remembers which map output locations are currently being fetched on a worker. */ - private val fetching = new HashSet[Int] + private val fetching = new ConcurrentSkipListSet[Int] /** * Send a message to the trackerActor and get its result within a default timeout, or @@ -130,12 +131,13 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - shuffleId.toString.intern.synchronized { + val monitor = shuffleId.toString.intern + monitor.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { try { - fetching.wait() + monitor.wait() } catch { case e: InterruptedException => } @@ -147,7 +149,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { // We have to do the fetch, get others to wait for us. - fetching += shuffleId + fetching.add(shuffleId) } } @@ -162,9 +164,9 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) } finally { - fetching.synchronized { - fetching -= shuffleId - fetching.notifyAll() + monitor.synchronized { + fetching.remove(shuffleId) + monitor.notifyAll() } } } From 46bccf551f8b4bf01cdbb18fc153808e5419477b Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 14:56:05 -0700 Subject: [PATCH 05/22] Manually revert custom changes to master --- .../org/apache/spark/MapOutputTracker.scala | 16 ++--- .../spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 +- .../apache/spark/MapOutputTrackerSuite.scala | 67 ++----------------- pom.xml | 10 +++ 5 files changed, 27 insertions(+), 72 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 681d75d48e5ba..894091761485d 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,7 +18,6 @@ package org.apache.spark import java.io._ -import java.util.concurrent.ConcurrentSkipListSet import java.util.zip.{GZIPInputStream, GZIPOutputStream} import scala.collection.mutable.{HashSet, HashMap, Map} @@ -96,7 +95,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging protected val epochLock = new AnyRef /** Remembers which map output locations are currently being fetched on a worker. */ - private val fetching = new ConcurrentSkipListSet[Int] + private val fetching = new HashSet[Int] /** * Send a message to the trackerActor and get its result within a default timeout, or @@ -131,13 +130,12 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - val monitor = shuffleId.toString.intern - monitor.synchronized { + fetching.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { try { - monitor.wait() + fetching.wait() } catch { case e: InterruptedException => } @@ -149,7 +147,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { // We have to do the fetch, get others to wait for us. - fetching.add(shuffleId) + fetching += shuffleId } } @@ -164,9 +162,9 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) } finally { - monitor.synchronized { - fetching.remove(shuffleId) - monitor.notifyAll() + fetching.synchronized { + fetching -= shuffleId + fetching.notifyAll() } } } diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index a42c8b43bbf7f..8c44c1696c833 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) - .setData(ByteString.copyFrom(data)) +// .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index c717e7c621a8f..8afe2ecfec40a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) - .setData(ByteString.copyFrom(createExecArg())) +// .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) - .setData(ByteString.copyFrom(task.serializedTask)) +// .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index af3e2ecb80589..9702838085627 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -17,11 +17,6 @@ package org.apache.spark -import java.util.Date -import java.util.concurrent._ - -import org.apache.log4j.Logger - import scala.concurrent.Await import akka.actor._ @@ -34,7 +29,6 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AkkaUtils class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { - val logger = Logger.getLogger(getClass.getName) private val conf = new SparkConf test("compressSize") { assert(MapOutputTracker.compressSize(0L) === 0) @@ -143,76 +137,29 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { val timeout = AkkaUtils.lookupTimeout(conf) slaveTracker.trackerActor = Await.result(selection.resolveOne(timeout), timeout) - // Test single shuffle execution - val shuffleId = 10 - val start = new Date().getTime - invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) - val singleFetchDuration = new Date().getTime - start - - // Test Parallel execution of shuffles - val NShuffles = 20 - import collection.mutable.ArrayBuffer - val threads = new ArrayBuffer[Thread](NShuffles) - val barrier = new CyclicBarrier(NShuffles) - val latch = new CountDownLatch(NShuffles) - - class ShuffleTesterThread(threadNum: Int, barrier: CyclicBarrier, latch : CountDownLatch) - extends Thread(s"ShuffleTester$threadNum") { - override def run() = { - val shuffleId = 20+threadNum - barrier.await - - invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) - latch.countDown() - } - } - - for (shuffle <- 1 to NShuffles) { - val t = new ShuffleTesterThread(shuffle, barrier, latch) - threads += t - t.start() - } - val pstart = new Date().getTime - latch.await(10, TimeUnit.SECONDS) - assert(latch.getCount == 0, "Not all shuffles completed within allowed time period") - val parallelFetchDuration = new Date().getTime - pstart - log(s"All $NShuffles fetches Completed: duration=$parallelFetchDuration " - + s"(vs single fetch=$singleFetchDuration)") - assert (parallelFetchDuration > singleFetchDuration - && parallelFetchDuration <= (NShuffles * 0.4) * singleFetchDuration, - "Parallel remote fetch should show strong sub-linear execution time increase " - + s" vs number of remote Fetches. " - + s"Actual: single fetch=$singleFetchDuration $NShuffles-threads=$parallelFetchDuration") - - } - - def invokeRemoteFetch(masterTracker: MapOutputTrackerMaster, - slaveTracker: MapOutputTrackerWorker, shuffleId: Int): Unit = { - masterTracker.registerShuffle(shuffleId, 1) + masterTracker.registerShuffle(10, 1) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } val compressedSize1000 = MapOutputTracker.compressSize(1000L) val size1000 = MapOutputTracker.decompressSize(compressedSize1000) - masterTracker.registerMapOutput(shuffleId, 0, new MapStatus( + masterTracker.registerMapOutput(10, 0, new MapStatus( BlockManagerId("a", "hostA", 1000, 0), Array(compressedSize1000))) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - assert(slaveTracker.getServerStatuses(shuffleId, 0).toSeq === + assert(slaveTracker.getServerStatuses(10, 0).toSeq === Seq((BlockManagerId("a", "hostA", 1000, 0), size1000))) - masterTracker.unregisterMapOutput(shuffleId, 0, BlockManagerId("a", "hostA", 1000, 0)) + masterTracker.unregisterMapOutput(10, 0, BlockManagerId("a", "hostA", 1000, 0)) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } // failure should be cached - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } } - def log(msg: String) = logger.info(msg) - test("remote fetch below akka frame size") { val newConf = new SparkConf newConf.set("spark.akka.frameSize", "1") diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..23d724dad7722 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,6 +1069,16 @@ + + hadoop-2.3-cdh5.0.0 + + 2.3.0-cdh5.0.0 + 2.5.0 + 0.9.0 + 0.96.1.1-cdh5.0.0 + + + hadoop-2.4 From a91f6a39af4e799617fb6f8cb490aaedfbb2af4e Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Wed, 16 Jul 2014 06:24:32 -0700 Subject: [PATCH 06/22] update pom.xml for hadoop-2.3-cdh50.0 and hbase 0.96.1.1 --- pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..23d724dad7722 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,6 +1069,16 @@ + + hadoop-2.3-cdh5.0.0 + + 2.3.0-cdh5.0.0 + 2.5.0 + 0.9.0 + 0.96.1.1-cdh5.0.0 + + + hadoop-2.4 From c638587cf929cf89846a3761b6de1ec084ac160a Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Wed, 23 Jul 2014 09:09:26 -0700 Subject: [PATCH 07/22] Mesos workaround --- .../org/apache/spark/executor/MesosExecutorBackend.scala | 2 +- .../spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index a42c8b43bbf7f..8c44c1696c833 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) - .setData(ByteString.copyFrom(data)) +// .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index c717e7c621a8f..8afe2ecfec40a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) - .setData(ByteString.copyFrom(createExecArg())) +// .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) - .setData(ByteString.copyFrom(task.serializedTask)) +// .setData(ByteString.copyFrom(task.serializedTask)) .build() } From 31dcd4fbad565e9dca06307a516a682d86dcfc09 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Tue, 22 Jul 2014 21:33:25 -0700 Subject: [PATCH 08/22] SPARK-2638 MapOutputTracker concurrency improvement Rolled back files not intended for checkin --- .../org/apache/spark/MapOutputTracker.scala | 2 +- .../spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 +- .../apache/spark/MapOutputTrackerSuite.scala | 67 +++++++++++++++++-- pom.xml | 10 --- 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 894091761485d..560782079ff31 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -130,7 +130,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - fetching.synchronized { + shuffleId.toString.intern.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index 8c44c1696c833..a42c8b43bbf7f 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) -// .setData(ByteString.copyFrom(data)) + .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 8afe2ecfec40a..c717e7c621a8f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) -// .setData(ByteString.copyFrom(createExecArg())) + .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) -// .setData(ByteString.copyFrom(task.serializedTask)) + .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index 9702838085627..af3e2ecb80589 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -17,6 +17,11 @@ package org.apache.spark +import java.util.Date +import java.util.concurrent._ + +import org.apache.log4j.Logger + import scala.concurrent.Await import akka.actor._ @@ -29,6 +34,7 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AkkaUtils class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { + val logger = Logger.getLogger(getClass.getName) private val conf = new SparkConf test("compressSize") { assert(MapOutputTracker.compressSize(0L) === 0) @@ -137,29 +143,76 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { val timeout = AkkaUtils.lookupTimeout(conf) slaveTracker.trackerActor = Await.result(selection.resolveOne(timeout), timeout) - masterTracker.registerShuffle(10, 1) + // Test single shuffle execution + val shuffleId = 10 + val start = new Date().getTime + invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) + val singleFetchDuration = new Date().getTime - start + + // Test Parallel execution of shuffles + val NShuffles = 20 + import collection.mutable.ArrayBuffer + val threads = new ArrayBuffer[Thread](NShuffles) + val barrier = new CyclicBarrier(NShuffles) + val latch = new CountDownLatch(NShuffles) + + class ShuffleTesterThread(threadNum: Int, barrier: CyclicBarrier, latch : CountDownLatch) + extends Thread(s"ShuffleTester$threadNum") { + override def run() = { + val shuffleId = 20+threadNum + barrier.await + + invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) + latch.countDown() + } + } + + for (shuffle <- 1 to NShuffles) { + val t = new ShuffleTesterThread(shuffle, barrier, latch) + threads += t + t.start() + } + val pstart = new Date().getTime + latch.await(10, TimeUnit.SECONDS) + assert(latch.getCount == 0, "Not all shuffles completed within allowed time period") + val parallelFetchDuration = new Date().getTime - pstart + log(s"All $NShuffles fetches Completed: duration=$parallelFetchDuration " + + s"(vs single fetch=$singleFetchDuration)") + assert (parallelFetchDuration > singleFetchDuration + && parallelFetchDuration <= (NShuffles * 0.4) * singleFetchDuration, + "Parallel remote fetch should show strong sub-linear execution time increase " + + s" vs number of remote Fetches. " + + s"Actual: single fetch=$singleFetchDuration $NShuffles-threads=$parallelFetchDuration") + + } + + def invokeRemoteFetch(masterTracker: MapOutputTrackerMaster, + slaveTracker: MapOutputTrackerWorker, shuffleId: Int): Unit = { + masterTracker.registerShuffle(shuffleId, 1) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } val compressedSize1000 = MapOutputTracker.compressSize(1000L) val size1000 = MapOutputTracker.decompressSize(compressedSize1000) - masterTracker.registerMapOutput(10, 0, new MapStatus( + masterTracker.registerMapOutput(shuffleId, 0, new MapStatus( BlockManagerId("a", "hostA", 1000, 0), Array(compressedSize1000))) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - assert(slaveTracker.getServerStatuses(10, 0).toSeq === + assert(slaveTracker.getServerStatuses(shuffleId, 0).toSeq === Seq((BlockManagerId("a", "hostA", 1000, 0), size1000))) - masterTracker.unregisterMapOutput(10, 0, BlockManagerId("a", "hostA", 1000, 0)) + masterTracker.unregisterMapOutput(shuffleId, 0, BlockManagerId("a", "hostA", 1000, 0)) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } // failure should be cached - intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } } + def log(msg: String) = logger.info(msg) + test("remote fetch below akka frame size") { val newConf = new SparkConf newConf.set("spark.akka.frameSize", "1") diff --git a/pom.xml b/pom.xml index 23d724dad7722..ae97bf03c53a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,16 +1069,6 @@ - - hadoop-2.3-cdh5.0.0 - - 2.3.0-cdh5.0.0 - 2.5.0 - 0.9.0 - 0.96.1.1-cdh5.0.0 - - - hadoop-2.4 From afe17e27bb0a9314b565dec96d17f9e3224d33bb Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 25 Jul 2014 02:15:39 -0700 Subject: [PATCH 09/22] Updated concurrency fix for using same monitor on the synchronized and wait logic --- .../org/apache/spark/MapOutputTracker.scala | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 560782079ff31..681d75d48e5ba 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io._ +import java.util.concurrent.ConcurrentSkipListSet import java.util.zip.{GZIPInputStream, GZIPOutputStream} import scala.collection.mutable.{HashSet, HashMap, Map} @@ -95,7 +96,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging protected val epochLock = new AnyRef /** Remembers which map output locations are currently being fetched on a worker. */ - private val fetching = new HashSet[Int] + private val fetching = new ConcurrentSkipListSet[Int] /** * Send a message to the trackerActor and get its result within a default timeout, or @@ -130,12 +131,13 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - shuffleId.toString.intern.synchronized { + val monitor = shuffleId.toString.intern + monitor.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { try { - fetching.wait() + monitor.wait() } catch { case e: InterruptedException => } @@ -147,7 +149,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { // We have to do the fetch, get others to wait for us. - fetching += shuffleId + fetching.add(shuffleId) } } @@ -162,9 +164,9 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) } finally { - fetching.synchronized { - fetching -= shuffleId - fetching.notifyAll() + monitor.synchronized { + fetching.remove(shuffleId) + monitor.notifyAll() } } } From 0d9db983b2adc5fd8223181a1dcc1a643b856e40 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 14:56:05 -0700 Subject: [PATCH 10/22] Manually revert custom changes to master --- .../org/apache/spark/MapOutputTracker.scala | 16 ++--- .../spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 +- .../apache/spark/MapOutputTrackerSuite.scala | 67 ++----------------- pom.xml | 10 +++ 5 files changed, 27 insertions(+), 72 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 681d75d48e5ba..894091761485d 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -18,7 +18,6 @@ package org.apache.spark import java.io._ -import java.util.concurrent.ConcurrentSkipListSet import java.util.zip.{GZIPInputStream, GZIPOutputStream} import scala.collection.mutable.{HashSet, HashMap, Map} @@ -96,7 +95,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging protected val epochLock = new AnyRef /** Remembers which map output locations are currently being fetched on a worker. */ - private val fetching = new ConcurrentSkipListSet[Int] + private val fetching = new HashSet[Int] /** * Send a message to the trackerActor and get its result within a default timeout, or @@ -131,13 +130,12 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") var fetchedStatuses: Array[MapStatus] = null - val monitor = shuffleId.toString.intern - monitor.synchronized { + fetching.synchronized { if (fetching.contains(shuffleId)) { // Someone else is fetching it; wait for them to be done while (fetching.contains(shuffleId)) { try { - monitor.wait() + fetching.wait() } catch { case e: InterruptedException => } @@ -149,7 +147,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging fetchedStatuses = mapStatuses.get(shuffleId).orNull if (fetchedStatuses == null) { // We have to do the fetch, get others to wait for us. - fetching.add(shuffleId) + fetching += shuffleId } } @@ -164,9 +162,9 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) } finally { - monitor.synchronized { - fetching.remove(shuffleId) - monitor.notifyAll() + fetching.synchronized { + fetching -= shuffleId + fetching.notifyAll() } } } diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index a42c8b43bbf7f..8c44c1696c833 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) - .setData(ByteString.copyFrom(data)) +// .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index c717e7c621a8f..8afe2ecfec40a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) - .setData(ByteString.copyFrom(createExecArg())) +// .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) - .setData(ByteString.copyFrom(task.serializedTask)) +// .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index af3e2ecb80589..9702838085627 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -17,11 +17,6 @@ package org.apache.spark -import java.util.Date -import java.util.concurrent._ - -import org.apache.log4j.Logger - import scala.concurrent.Await import akka.actor._ @@ -34,7 +29,6 @@ import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.AkkaUtils class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { - val logger = Logger.getLogger(getClass.getName) private val conf = new SparkConf test("compressSize") { assert(MapOutputTracker.compressSize(0L) === 0) @@ -143,76 +137,29 @@ class MapOutputTrackerSuite extends FunSuite with LocalSparkContext { val timeout = AkkaUtils.lookupTimeout(conf) slaveTracker.trackerActor = Await.result(selection.resolveOne(timeout), timeout) - // Test single shuffle execution - val shuffleId = 10 - val start = new Date().getTime - invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) - val singleFetchDuration = new Date().getTime - start - - // Test Parallel execution of shuffles - val NShuffles = 20 - import collection.mutable.ArrayBuffer - val threads = new ArrayBuffer[Thread](NShuffles) - val barrier = new CyclicBarrier(NShuffles) - val latch = new CountDownLatch(NShuffles) - - class ShuffleTesterThread(threadNum: Int, barrier: CyclicBarrier, latch : CountDownLatch) - extends Thread(s"ShuffleTester$threadNum") { - override def run() = { - val shuffleId = 20+threadNum - barrier.await - - invokeRemoteFetch(masterTracker, slaveTracker, shuffleId) - latch.countDown() - } - } - - for (shuffle <- 1 to NShuffles) { - val t = new ShuffleTesterThread(shuffle, barrier, latch) - threads += t - t.start() - } - val pstart = new Date().getTime - latch.await(10, TimeUnit.SECONDS) - assert(latch.getCount == 0, "Not all shuffles completed within allowed time period") - val parallelFetchDuration = new Date().getTime - pstart - log(s"All $NShuffles fetches Completed: duration=$parallelFetchDuration " - + s"(vs single fetch=$singleFetchDuration)") - assert (parallelFetchDuration > singleFetchDuration - && parallelFetchDuration <= (NShuffles * 0.4) * singleFetchDuration, - "Parallel remote fetch should show strong sub-linear execution time increase " - + s" vs number of remote Fetches. " - + s"Actual: single fetch=$singleFetchDuration $NShuffles-threads=$parallelFetchDuration") - - } - - def invokeRemoteFetch(masterTracker: MapOutputTrackerMaster, - slaveTracker: MapOutputTrackerWorker, shuffleId: Int): Unit = { - masterTracker.registerShuffle(shuffleId, 1) + masterTracker.registerShuffle(10, 1) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } val compressedSize1000 = MapOutputTracker.compressSize(1000L) val size1000 = MapOutputTracker.decompressSize(compressedSize1000) - masterTracker.registerMapOutput(shuffleId, 0, new MapStatus( + masterTracker.registerMapOutput(10, 0, new MapStatus( BlockManagerId("a", "hostA", 1000, 0), Array(compressedSize1000))) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - assert(slaveTracker.getServerStatuses(shuffleId, 0).toSeq === + assert(slaveTracker.getServerStatuses(10, 0).toSeq === Seq((BlockManagerId("a", "hostA", 1000, 0), size1000))) - masterTracker.unregisterMapOutput(shuffleId, 0, BlockManagerId("a", "hostA", 1000, 0)) + masterTracker.unregisterMapOutput(10, 0, BlockManagerId("a", "hostA", 1000, 0)) masterTracker.incrementEpoch() slaveTracker.updateEpoch(masterTracker.getEpoch) - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } // failure should be cached - intercept[FetchFailedException] { slaveTracker.getServerStatuses(shuffleId, 0) } + intercept[FetchFailedException] { slaveTracker.getServerStatuses(10, 0) } } - def log(msg: String) = logger.info(msg) - test("remote fetch below akka frame size") { val newConf = new SparkConf newConf.set("spark.akka.frameSize", "1") diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..23d724dad7722 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,6 +1069,16 @@ + + hadoop-2.3-cdh5.0.0 + + 2.3.0-cdh5.0.0 + 2.5.0 + 0.9.0 + 0.96.1.1-cdh5.0.0 + + + hadoop-2.4 From b08c87f2f620c225b5cd7c420f42ca65676f4ebc Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 15:36:52 -0700 Subject: [PATCH 11/22] Revert "update pom.xml for hadoop-2.3-cdh50.0 and hbase 0.96.1.1" This reverts commit a91f6a39af4e799617fb6f8cb490aaedfbb2af4e. --- pom.xml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pom.xml b/pom.xml index 23d724dad7722..ae97bf03c53a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,16 +1069,6 @@ - - hadoop-2.3-cdh5.0.0 - - 2.3.0-cdh5.0.0 - 2.5.0 - 0.9.0 - 0.96.1.1-cdh5.0.0 - - - hadoop-2.4 From 1e10e0024cb6d2f354f9aea72fdff6f6b9ecab6c Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 15:37:13 -0700 Subject: [PATCH 12/22] Revert "SPARK-2638 MapOutputTracker concurrency improvement" This reverts commit 31dcd4fbad565e9dca06307a516a682d86dcfc09. --- pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pom.xml b/pom.xml index ae97bf03c53a2..23d724dad7722 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,6 +1069,16 @@ + + hadoop-2.3-cdh5.0.0 + + 2.3.0-cdh5.0.0 + 2.5.0 + 0.9.0 + 0.96.1.1-cdh5.0.0 + + + hadoop-2.4 From dea01f506837cf0e14e117bc7783461909f44041 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 15:52:33 -0700 Subject: [PATCH 13/22] Manually revert custom changes to master --- .../apache/spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 ++-- pom.xml | 10 ---------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index 8c44c1696c833..a42c8b43bbf7f 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) -// .setData(ByteString.copyFrom(data)) + .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 8afe2ecfec40a..c717e7c621a8f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) -// .setData(ByteString.copyFrom(createExecArg())) + .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) -// .setData(ByteString.copyFrom(task.serializedTask)) + .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/pom.xml b/pom.xml index 23d724dad7722..ae97bf03c53a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,16 +1069,6 @@ - - hadoop-2.3-cdh5.0.0 - - 2.3.0-cdh5.0.0 - 2.5.0 - 0.9.0 - 0.96.1.1-cdh5.0.0 - - - hadoop-2.4 From 2fc131ec57d8c298c0e3759ef177f62010ba9a09 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 16:04:51 -0700 Subject: [PATCH 14/22] Do this again: Manually revert custom changes to master --- .../apache/spark/executor/MesosExecutorBackend.scala | 2 +- .../cluster/mesos/MesosSchedulerBackend.scala | 4 ++-- pom.xml | 10 ---------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index 8c44c1696c833..a42c8b43bbf7f 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -41,7 +41,7 @@ private[spark] class MesosExecutorBackend driver.sendStatusUpdate(MesosTaskStatus.newBuilder() .setTaskId(mesosTaskId) .setState(TaskState.toMesos(state)) -// .setData(ByteString.copyFrom(data)) + .setData(ByteString.copyFrom(data)) .build()) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 8afe2ecfec40a..c717e7c621a8f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -116,7 +116,7 @@ private[spark] class MesosSchedulerBackend( ExecutorInfo.newBuilder() .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) -// .setData(ByteString.copyFrom(createExecArg())) + .setData(ByteString.copyFrom(createExecArg())) .addResources(memory) .build() } @@ -253,7 +253,7 @@ private[spark] class MesosSchedulerBackend( .setExecutor(createExecutorInfo(slaveId)) .setName(task.name) .addResources(cpuResource) -// .setData(ByteString.copyFrom(task.serializedTask)) + .setData(ByteString.copyFrom(task.serializedTask)) .build() } diff --git a/pom.xml b/pom.xml index 23d724dad7722..ae97bf03c53a2 100644 --- a/pom.xml +++ b/pom.xml @@ -1069,16 +1069,6 @@ - - hadoop-2.3-cdh5.0.0 - - 2.3.0-cdh5.0.0 - 2.5.0 - 0.9.0 - 0.96.1.1-cdh5.0.0 - - - hadoop-2.4 From 42f5016d0fb000c80bbd9eb4833511b9429776b0 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Thu, 24 Jul 2014 23:26:11 -0700 Subject: [PATCH 15/22] SPARK-2686 Add Length support to Spark SQL and HQL and Strlen support to SQL --- .../apache/spark/sql/catalyst/SqlParser.scala | 10 +++ .../sql/catalyst/expressions/Expression.scala | 2 +- .../expressions/stringOperations.scala | 81 ++++++++++++++++++- .../sql/catalyst/optimizer/Optimizer.scala | 3 + .../ExpressionEvaluationSuite.scala | 29 +++++++ .../optimizer/ConstantFoldingSuite.scala | 12 ++- .../org/apache/spark/sql/QueryTest.scala | 2 + .../org/apache/spark/sql/SQLQuerySuite.scala | 28 +++++++ .../org/apache/spark/sql/hive/HiveQl.scala | 8 +- 9 files changed, 170 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 2c73a80f64ebf..a73514f4fc197 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -122,6 +122,9 @@ class SqlParser extends StandardTokenParsers with PackratParsers { protected val EXCEPT = Keyword("EXCEPT") protected val SUBSTR = Keyword("SUBSTR") protected val SUBSTRING = Keyword("SUBSTRING") + protected val LEN = Keyword("LEN") + protected val LENGTH = Keyword("LENGTH") + protected val STRLEN = Keyword("STRLEN") // Use reflection to find the reserved words defined in this class. protected val reservedWords = @@ -323,6 +326,13 @@ class SqlParser extends StandardTokenParsers with PackratParsers { (SUBSTR | SUBSTRING) ~> "(" ~> expression ~ "," ~ expression ~ "," ~ expression <~ ")" ^^ { case s ~ "," ~ p ~ "," ~ l => Substring(s,p,l) } | + (LEN | LENGTH) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | + STRLEN ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { + case s ~ "," ~ e => Strlen(s, e) + } | + STRLEN ~> "(" ~> expression <~ ")" ^^ { + case s => Strlen(s, Literal(StrlenConstants.DefaultEncoding)) + } | ident ~ "(" ~ repsep(expression, ",") <~ ")" ^^ { case udfName ~ _ ~ exprs => UnresolvedFunction(udfName, exprs) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index ba62dabe3dd6a..5829afec7d928 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -37,7 +37,7 @@ abstract class Expression extends TreeNode[Expression] { * - A [[BinaryExpression]] is foldable if its both left and right child are foldable * - A [[Not]], [[IsNull]], or [[IsNotNull]] is foldable if its child is foldable * - A [[Literal]] is foldable - * - A [[Cast]] or [[UnaryMinus]] is foldable if its child is foldable + * - A [[Cast]] or [[UnaryMinus]] or [[Length/Strlen]] is foldable if its child is foldable */ def foldable: Boolean = false def nullable: Boolean diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 97fc3a3b14b88..a61435e7250f3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -17,13 +17,16 @@ package org.apache.spark.sql.catalyst.expressions +import java.io.UnsupportedEncodingException import java.util.regex.Pattern +import org.apache.spark.Logging + import scala.collection.IndexedSeqOptimized import org.apache.spark.sql.catalyst.analysis.UnresolvedException -import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType} +import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType, IntegerType} trait StringRegexExpression { self: BinaryExpression => @@ -208,6 +211,82 @@ case class EndsWith(left: Expression, right: Expression) def compare(l: String, r: String) = l.endsWith(r) } +/** + * A function that returns the number of bytes in an expression + */ +case class Length(child: Expression) extends UnaryExpression { + + type EvaluatedType = Any + + override def dataType = IntegerType + + override def foldable = child.foldable + + override def nullable = child.nullable + + override def toString = s"Length($child)" + + override def eval(input: Row): EvaluatedType = { + val string = child.eval(input) + if (string == null) { + null + } else if (!string.isInstanceOf[String]) { + string.toString.length + } else { + new String(string.toString.getBytes, StrlenConstants.DefaultEncoding).length + } + } + +} + +object StrlenConstants { + val DefaultEncoding = "ISO-8859-1" +} + +/** + * A function that returns the number of characters in a string expression + */ +case class Strlen(child: Expression, encoding : Expression) extends UnaryExpression + with Logging { + + type EvaluatedType = Any + + override def dataType = IntegerType + + override def foldable = child.foldable + + override def nullable = true + + override def toString = s"Strlen($child, $encoding)" + + override def eval(input: Row): EvaluatedType = { + val string = child.eval(input) + if (string == null) { + null + } else if (!string.isInstanceOf[String]) { + log.debug(s"Non-string value [$string] provided to strlen") + null + } else { + var evalEncoding = encoding.eval(input) + val strEncoding = + if (evalEncoding != null) { + evalEncoding.toString + } else { + StrlenConstants.DefaultEncoding + } + val s: String = "" + try { + new String(string.asInstanceOf[String].getBytes, strEncoding).length + } catch { + case ue : UnsupportedEncodingException => { + log.debug(s"strlen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") + null + } + } + } + } +} + /** * A function that takes a substring of its first argument starting at a given position. * Defined for String and Binary types. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 5f86d6047cb9c..cd54808d3dd39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -174,6 +174,9 @@ object NullPropagation extends Rule[LogicalPlan] { case e @ Substring(_, Literal(null, _), _) => Literal(null, e.dataType) case e @ Substring(_, _, Literal(null, _)) => Literal(null, e.dataType) + case e @ Length(Literal(null, _)) => Literal(null, e.dataType) + case e @ Strlen(Literal(null, _),_) => Literal(null, e.dataType) + // Put exceptional cases above if any case e: BinaryArithmetic => e.children match { case Literal(null, _) :: right :: Nil => Literal(null, e.dataType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 999c9fff38d60..c5016e6eeb5e2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -18,11 +18,14 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.Timestamp +import java.util.concurrent.atomic.AtomicInteger import org.scalatest.FunSuite import org.apache.spark.sql.catalyst.types._ +import scala.collection.mutable.ArrayBuffer + /* Implicit conversions */ import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -567,4 +570,30 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(s.substring(0, 2), "ex", row) checkEvaluation(s.substring(0), "example", row) } + + test("Length") { + checkEvaluation(Length(Literal(null, IntegerType)), null) + checkEvaluation(Length(Literal(0,IntegerType)), 1) + checkEvaluation(Length(Literal(12,IntegerType)), 2) + checkEvaluation(Length(Literal(123,IntegerType)), 3) + checkEvaluation(Length(Literal(12.4F, FloatType)), 4) + checkEvaluation(Length(Literal(12345678901L,LongType)), 11) + checkEvaluation(Length(Literal(1234567890.2D, DoubleType)), 14) + checkEvaluation(Length(Literal("1234567890ABC",StringType)), 13) + checkEvaluation(Length(Literal("\uF93D\uF936\uF949\uF942",StringType)), 4) + } + + test("Strlen") { + checkEvaluation(Strlen(Literal(null, StringType), "ISO-8859-1"), null) + checkEvaluation(Strlen(Literal(null, StringType), "UTF-8"), null) + checkEvaluation(Strlen(Literal(null, StringType), "UTF-16"), null) + checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "ISO-8859-1"), 13) + checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "UTF-8"), 13) + checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "UTF-16"), 7) + checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "ISO-8859-1"), 4) + checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-8"), 4) + checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-16"), 2) + checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-32"), 1) + } + } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 0a27cce337482..08a8868f4ef68 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -208,7 +208,11 @@ class ConstantFoldingSuite extends PlanTest { Substring("abc", 0, Literal(null, IntegerType)) as 'c18, Contains(Literal(null, StringType), "abc") as 'c19, - Contains("abc", Literal(null, StringType)) as 'c20 + Contains("abc", Literal(null, StringType)) as 'c20, + + Length(Literal(null, IntegerType)) as 'c21, + Strlen(Literal(null, StringType), "ISO-8859-1") as 'c22 + ) val optimized = Optimize(originalQuery.analyze) @@ -243,7 +247,11 @@ class ConstantFoldingSuite extends PlanTest { Literal(null, StringType) as 'c18, Literal(null, BooleanType) as 'c19, - Literal(null, BooleanType) as 'c20 + Literal(null, BooleanType) as 'c20, + + Literal(null, IntegerType) as 'c21, + Literal(null, IntegerType) as 'c22 + ).analyze comparePlans(optimized, correctAnswer) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 1fd8d27b34c59..e7aa3265d7b60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.io.{PrintWriter, StringWriter} + import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5c571d35d1bb9..ce21aa49fbd70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -47,6 +47,34 @@ class SQLQuerySuite extends QueryTest { checkAnswer( sql("SELECT substring(tableName, 3) FROM tableName"), "st") + checkAnswer( + sql("SELECT substring(tableName, 2) FROM tableName group by substring(tableName, 2)"), + "est") + } + + test("SPARK-TBD Added Parser of SQL LENGTH()") { + checkAnswer( + sql("SELECT length(key) as keylen from testData where key = 100"), 3) + checkAnswer( + sql("SELECT len(key), count(*) as cnt from testData where key <= 100 group by len(key)"), + Seq(Seq(1,9),Seq(2,90), Seq(3,1))) + checkAnswer( + sql("SELECT max(length(key * key) - len(key)) from testData where key <= 100"), 2) + checkAnswer( + sql("SELECT min(Length(s)) FROM nullableRepeatedData where s is not null"), 4) + checkAnswer( + sql("SELECT max(LENGTH(s)) FROM nullableRepeatedData"), 4) + } + + test("SPARK-TBD Added Parser of SQL STRLEN()") { + checkAnswer( + sql("SELECT StrLen(s) from repeatedData"), Seq(Seq(4),Seq(4))) + checkAnswer( + sql("SELECT StrLen(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) + checkAnswer( + sql("SELECT max(StrLen(s,'UTF-8')) from nullStrings"), 3) + checkAnswer( + sql("SELECT strlen('a','ISO-8859-1') + strlen('abcde','ISO-8859-1') FROM testData limit 1"), 6) } test("index into array") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 3d2eb1eefaeda..29f6c532627fd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -866,6 +866,7 @@ private[hive] object HiveQl { val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r val SUBSTR = "(?i)SUBSTR(?:ING)?".r + val STRLEN = "(?i)STRLEN".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ @@ -995,8 +996,13 @@ private[hive] object HiveQl { case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) - case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => + case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) + case Token("TOK_FUNCTION", Token(STRLEN(), Nil) :: string :: Nil) => + Strlen(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) + case Token("TOK_FUNCTION", Token(STRLEN(), Nil) :: string :: encoding :: Nil) => + Strlen(nodeToExpr(string), nodeToExpr(encoding)) + /* UDFs - Must be last otherwise will preempt built in functions */ case Token("TOK_FUNCTION", Token(name, Nil) :: args) => From ad3859eb547669125ad77e64c645001c1dbe4dc3 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 12:28:23 -0700 Subject: [PATCH 16/22] Ongoing work with Ueshin and Marmbrus --- .../apache/spark/sql/catalyst/SqlParser.scala | 13 +++--- .../sql/catalyst/expressions/Expression.scala | 2 +- .../expressions/stringOperations.scala | 42 ++++++++++++------- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../ExpressionEvaluationSuite.scala | 22 +++++----- .../optimizer/ConstantFoldingSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 14 +++---- .../org/apache/spark/sql/hive/HiveQl.scala | 11 ++--- 8 files changed, 62 insertions(+), 46 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index a73514f4fc197..77a3dc2729a48 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -124,7 +124,8 @@ class SqlParser extends StandardTokenParsers with PackratParsers { protected val SUBSTRING = Keyword("SUBSTRING") protected val LEN = Keyword("LEN") protected val LENGTH = Keyword("LENGTH") - protected val STRLEN = Keyword("STRLEN") + protected val CHAR_LEN = Keyword("CHAR_LEN") + protected val OCTET_LEN = Keyword("OCTET_LEN") // Use reflection to find the reserved words defined in this class. protected val reservedWords = @@ -326,12 +327,12 @@ class SqlParser extends StandardTokenParsers with PackratParsers { (SUBSTR | SUBSTRING) ~> "(" ~> expression ~ "," ~ expression ~ "," ~ expression <~ ")" ^^ { case s ~ "," ~ p ~ "," ~ l => Substring(s,p,l) } | - (LEN | LENGTH) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | - STRLEN ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { - case s ~ "," ~ e => Strlen(s, e) + (LEN | LENGTH | CHAR_LEN) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | + OCTET_LEN ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { + case s ~ "," ~ e => OctetLen(s, e) } | - STRLEN ~> "(" ~> expression <~ ")" ^^ { - case s => Strlen(s, Literal(StrlenConstants.DefaultEncoding)) + OCTET_LEN ~> "(" ~> expression <~ ")" ^^ { + case s => OctetLen(s, Literal(OctetLenConstants.DefaultEncoding)) } | ident ~ "(" ~ repsep(expression, ",") <~ ")" ^^ { case udfName ~ _ ~ exprs => UnresolvedFunction(udfName, exprs) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 5829afec7d928..f3a7233b0b46d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -37,7 +37,7 @@ abstract class Expression extends TreeNode[Expression] { * - A [[BinaryExpression]] is foldable if its both left and right child are foldable * - A [[Not]], [[IsNull]], or [[IsNotNull]] is foldable if its child is foldable * - A [[Literal]] is foldable - * - A [[Cast]] or [[UnaryMinus]] or [[Length/Strlen]] is foldable if its child is foldable + * - A [[Cast]] or [[UnaryMinus]] or [[Length/Octetlen]] is foldable if its child is foldable */ def foldable: Boolean = false def nullable: Boolean diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index a61435e7250f3..99073283b761a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -211,6 +211,7 @@ case class EndsWith(left: Expression, right: Expression) def compare(l: String, r: String) = l.endsWith(r) } + /** * A function that returns the number of bytes in an expression */ @@ -227,26 +228,40 @@ case class Length(child: Expression) extends UnaryExpression { override def toString = s"Length($child)" override def eval(input: Row): EvaluatedType = { - val string = child.eval(input) - if (string == null) { + val inputVal = child.eval(input) + if (inputVal == null) { null - } else if (!string.isInstanceOf[String]) { - string.toString.length + } else if (!inputVal.isInstanceOf[String]) { + inputVal.toString.length } else { - new String(string.toString.getBytes, StrlenConstants.DefaultEncoding).length + OctetLenUtils.len(inputVal.asInstanceOf[String]) } } } -object StrlenConstants { +object OctetLenConstants { val DefaultEncoding = "ISO-8859-1" } +object OctetLenUtils { + def len(s : String) = { + if (s == null) { + null + } else { + @inline def isUtfStartByte(b : Byte) = (b & 0xC0) != 0x80 + s.getBytes.foldLeft(0) { case (cnt, b) => { + cnt + (if (isUtfStartByte(b)) 1 else 0) + } + } + } + } +} + /** * A function that returns the number of characters in a string expression */ -case class Strlen(child: Expression, encoding : Expression) extends UnaryExpression +case class OctetLen(child: Expression, encoding : Expression) extends UnaryExpression with Logging { type EvaluatedType = Any @@ -257,14 +272,14 @@ case class Strlen(child: Expression, encoding : Expression) extends UnaryExpress override def nullable = true - override def toString = s"Strlen($child, $encoding)" + override def toString = s"OctetLen($child, $encoding)" override def eval(input: Row): EvaluatedType = { val string = child.eval(input) if (string == null) { null } else if (!string.isInstanceOf[String]) { - log.debug(s"Non-string value [$string] provided to strlen") + log.debug(s"Non-string value [$string] provided to OctetLen") null } else { var evalEncoding = encoding.eval(input) @@ -272,18 +287,17 @@ case class Strlen(child: Expression, encoding : Expression) extends UnaryExpress if (evalEncoding != null) { evalEncoding.toString } else { - StrlenConstants.DefaultEncoding + OctetLenConstants.DefaultEncoding } val s: String = "" try { - new String(string.asInstanceOf[String].getBytes, strEncoding).length + string.asInstanceOf[String].getBytes(strEncoding).length } catch { case ue : UnsupportedEncodingException => { - log.debug(s"strlen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") - null + throw new UnsupportedEncodingException(s"OctetLen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") } } - } + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index cd54808d3dd39..d140da368039f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -175,7 +175,7 @@ object NullPropagation extends Rule[LogicalPlan] { case e @ Substring(_, _, Literal(null, _)) => Literal(null, e.dataType) case e @ Length(Literal(null, _)) => Literal(null, e.dataType) - case e @ Strlen(Literal(null, _),_) => Literal(null, e.dataType) + case e @ OctetLen(Literal(null, _),_) => Literal(null, e.dataType) // Put exceptional cases above if any case e: BinaryArithmetic => e.children match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index c5016e6eeb5e2..df816f411ee6c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -583,17 +583,17 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(Length(Literal("\uF93D\uF936\uF949\uF942",StringType)), 4) } - test("Strlen") { - checkEvaluation(Strlen(Literal(null, StringType), "ISO-8859-1"), null) - checkEvaluation(Strlen(Literal(null, StringType), "UTF-8"), null) - checkEvaluation(Strlen(Literal(null, StringType), "UTF-16"), null) - checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "ISO-8859-1"), 13) - checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "UTF-8"), 13) - checkEvaluation(Strlen(Literal("1234567890ABC", StringType), "UTF-16"), 7) - checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "ISO-8859-1"), 4) - checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-8"), 4) - checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-16"), 2) - checkEvaluation(Strlen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-32"), 1) + test("OctetLen") { + checkEvaluation(OctetLen(Literal(null, StringType), "ISO-8859-1"), null) + checkEvaluation(OctetLen(Literal(null, StringType), "UTF-8"), null) + checkEvaluation(OctetLen(Literal(null, StringType), "UTF-16"), null) + checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "ISO-8859-1"), 13) + checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "UTF-8"), 13) + checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "UTF-16"), 7) + checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "ISO-8859-1"), 4) + checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-8"), 4) + checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-16"), 2) + checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-32"), 1) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 08a8868f4ef68..4997770d13c5a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -211,7 +211,7 @@ class ConstantFoldingSuite extends PlanTest { Contains("abc", Literal(null, StringType)) as 'c20, Length(Literal(null, IntegerType)) as 'c21, - Strlen(Literal(null, StringType), "ISO-8859-1") as 'c22 + OctetLen(Literal(null, StringType), "ISO-8859-1") as 'c22 ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index ce21aa49fbd70..5a8c3219f0cd3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -52,9 +52,9 @@ class SQLQuerySuite extends QueryTest { "est") } - test("SPARK-TBD Added Parser of SQL LENGTH()") { + test("SPARK-2686 Added Parser of SQL LENGTH()") { checkAnswer( - sql("SELECT length(key) as keylen from testData where key = 100"), 3) + sql("SELECT char_length(key) as keylen from testData where key = 100"), 3) checkAnswer( sql("SELECT len(key), count(*) as cnt from testData where key <= 100 group by len(key)"), Seq(Seq(1,9),Seq(2,90), Seq(3,1))) @@ -66,15 +66,15 @@ class SQLQuerySuite extends QueryTest { sql("SELECT max(LENGTH(s)) FROM nullableRepeatedData"), 4) } - test("SPARK-TBD Added Parser of SQL STRLEN()") { + test("SPARK-2686 Added Parser of SQL STRLEN()") { checkAnswer( - sql("SELECT StrLen(s) from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_len(s) from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT StrLen(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_len(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT max(StrLen(s,'UTF-8')) from nullStrings"), 3) + sql("SELECT max(octet_len(s,'UTF-8')) from nullStrings"), 3) checkAnswer( - sql("SELECT strlen('a','ISO-8859-1') + strlen('abcde','ISO-8859-1') FROM testData limit 1"), 6) + sql("SELECT octet_len('a','ISO-8859-1') + strlen('abcde','ISO-8859-1') FROM testData limit 1"), 6) } test("index into array") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 29f6c532627fd..c7ea1cc59a487 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -866,7 +866,8 @@ private[hive] object HiveQl { val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r val SUBSTR = "(?i)SUBSTR(?:ING)?".r - val STRLEN = "(?i)STRLEN".r + val CHAR_LEN = "(?i)CHAR_LEN".r + val OCTET_LEN = "(?i)OCTET_LEN".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ @@ -998,10 +999,10 @@ private[hive] object HiveQl { Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) - case Token("TOK_FUNCTION", Token(STRLEN(), Nil) :: string :: Nil) => - Strlen(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) - case Token("TOK_FUNCTION", Token(STRLEN(), Nil) :: string :: encoding :: Nil) => - Strlen(nodeToExpr(string), nodeToExpr(encoding)) + case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: Nil) => + OctetLen(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) + case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: encoding :: Nil) => + OctetLen(nodeToExpr(string), nodeToExpr(encoding)) /* UDFs - Must be last otherwise will preempt built in functions */ From 6a6222ade546ab624925c1977ec67f19700d53e1 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 14:27:35 -0700 Subject: [PATCH 17/22] Ongoing work with Takuya and Michael A --- .../apache/spark/sql/catalyst/SqlParser.scala | 14 +- .../expressions/stringOperations.scala | 16 +- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../ExpressionEvaluationSuite.scala | 149 ++++++++++-------- .../optimizer/ConstantFoldingSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 10 +- .../org/apache/spark/sql/hive/HiveQl.scala | 12 +- 7 files changed, 110 insertions(+), 95 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 77a3dc2729a48..188953b272dac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -124,8 +124,8 @@ class SqlParser extends StandardTokenParsers with PackratParsers { protected val SUBSTRING = Keyword("SUBSTRING") protected val LEN = Keyword("LEN") protected val LENGTH = Keyword("LENGTH") - protected val CHAR_LEN = Keyword("CHAR_LEN") - protected val OCTET_LEN = Keyword("OCTET_LEN") + protected val CHAR_LENGTH = Keyword("CHAR_LENGTH") + protected val OCTET_LENGTH = Keyword("OCTET_LENGTH") // Use reflection to find the reserved words defined in this class. protected val reservedWords = @@ -327,12 +327,12 @@ class SqlParser extends StandardTokenParsers with PackratParsers { (SUBSTR | SUBSTRING) ~> "(" ~> expression ~ "," ~ expression ~ "," ~ expression <~ ")" ^^ { case s ~ "," ~ p ~ "," ~ l => Substring(s,p,l) } | - (LEN | LENGTH | CHAR_LEN) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | - OCTET_LEN ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { - case s ~ "," ~ e => OctetLen(s, e) + (LEN | LENGTH | CHAR_LENGTH) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | + OCTET_LENGTH ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { + case s ~ "," ~ e => OctetLength(s, e) } | - OCTET_LEN ~> "(" ~> expression <~ ")" ^^ { - case s => OctetLen(s, Literal(OctetLenConstants.DefaultEncoding)) + OCTET_LENGTH ~> "(" ~> expression <~ ")" ^^ { + case s => OctetLength(s, Literal(OctetLengthConstants.DefaultEncoding)) } | ident ~ "(" ~ repsep(expression, ",") <~ ")" ^^ { case udfName ~ _ ~ exprs => UnresolvedFunction(udfName, exprs) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 99073283b761a..505642394e552 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -240,7 +240,7 @@ case class Length(child: Expression) extends UnaryExpression { } -object OctetLenConstants { +object OctetLengthConstants { val DefaultEncoding = "ISO-8859-1" } @@ -261,7 +261,7 @@ object OctetLenUtils { /** * A function that returns the number of characters in a string expression */ -case class OctetLen(child: Expression, encoding : Expression) extends UnaryExpression +case class OctetLength(child: Expression, encoding : Expression) extends UnaryExpression with Logging { type EvaluatedType = Any @@ -275,11 +275,11 @@ case class OctetLen(child: Expression, encoding : Expression) extends UnaryExpre override def toString = s"OctetLen($child, $encoding)" override def eval(input: Row): EvaluatedType = { - val string = child.eval(input) - if (string == null) { + val evalInput = child.eval(input) + if (evalInput == null) { null - } else if (!string.isInstanceOf[String]) { - log.debug(s"Non-string value [$string] provided to OctetLen") + } else if (!evalInput.isInstanceOf[String]) { + log.debug(s"Non-string value [$evalInput] provided to OctetLen") null } else { var evalEncoding = encoding.eval(input) @@ -287,11 +287,11 @@ case class OctetLen(child: Expression, encoding : Expression) extends UnaryExpre if (evalEncoding != null) { evalEncoding.toString } else { - OctetLenConstants.DefaultEncoding + OctetLengthConstants.DefaultEncoding } val s: String = "" try { - string.asInstanceOf[String].getBytes(strEncoding).length + evalInput.asInstanceOf[String].getBytes(strEncoding).length } catch { case ue : UnsupportedEncodingException => { throw new UnsupportedEncodingException(s"OctetLen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index d140da368039f..02b81b799e4c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -175,7 +175,7 @@ object NullPropagation extends Rule[LogicalPlan] { case e @ Substring(_, _, Literal(null, _)) => Literal(null, e.dataType) case e @ Length(Literal(null, _)) => Literal(null, e.dataType) - case e @ OctetLen(Literal(null, _),_) => Literal(null, e.dataType) + case e @ OctetLength(Literal(null, _),_) => Literal(null, e.dataType) // Put exceptional cases above if any case e: BinaryArithmetic => e.children match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index df816f411ee6c..d3cab802dbf78 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.types._ import scala.collection.mutable.ArrayBuffer /* Implicit conversions */ + import org.apache.spark.sql.catalyst.dsl.expressions._ class ExpressionEvaluationSuite extends FunSuite { @@ -62,8 +63,8 @@ class ExpressionEvaluationSuite extends FunSuite { val notTrueTable = (true, false) :: - (false, true) :: - (null, null) :: Nil + (false, true) :: + (null, null) :: Nil test("3VL Not") { notTrueTable.foreach { @@ -73,45 +74,45 @@ class ExpressionEvaluationSuite extends FunSuite { } booleanLogicTest("AND", _ && _, - (true, true, true) :: - (true, false, false) :: - (true, null, null) :: - (false, true, false) :: - (false, false, false) :: - (false, null, false) :: - (null, true, null) :: - (null, false, false) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, false) :: + (true, null, null) :: + (false, true, false) :: + (false, false, false) :: + (false, null, false) :: + (null, true, null) :: + (null, false, false) :: + (null, null, null) :: Nil) booleanLogicTest("OR", _ || _, - (true, true, true) :: - (true, false, true) :: - (true, null, true) :: - (false, true, true) :: - (false, false, false) :: - (false, null, null) :: - (null, true, true) :: - (null, false, null) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, true) :: + (true, null, true) :: + (false, true, true) :: + (false, false, false) :: + (false, null, null) :: + (null, true, true) :: + (null, false, null) :: + (null, null, null) :: Nil) booleanLogicTest("=", _ === _, - (true, true, true) :: - (true, false, false) :: - (true, null, null) :: - (false, true, false) :: - (false, false, true) :: - (false, null, null) :: - (null, true, null) :: - (null, false, null) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, false) :: + (true, null, null) :: + (false, true, false) :: + (false, false, true) :: + (false, null, null) :: + (null, true, null) :: + (null, false, null) :: + (null, null, null) :: Nil) def booleanLogicTest( - name: String, - op: (Expression, Expression) => Expression, - truthTable: Seq[(Any, Any, Any)]) { + name: String, + op: (Expression, Expression) => Expression, + truthTable: Seq[(Any, Any, Any)]) { test(s"3VL $name") { truthTable.foreach { - case (l,r,answer) => + case (l, r, answer) => val expr = op(Literal(l, BooleanType), Literal(r, BooleanType)) checkEvaluation(expr, answer) } @@ -126,8 +127,8 @@ class ExpressionEvaluationSuite extends FunSuite { val actual = try evaluate(expression, inputRow) catch { case e: Exception => fail(s"Exception evaluating $expression", e) } - if(actual != expected) { - val input = if(inputRow == EmptyRow) "" else s", input: $inputRow" + if (actual != expected) { + val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input") } } @@ -152,8 +153,8 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation("addb" like "a%", true) checkEvaluation("addb" like "**", false) checkEvaluation("abc" like "a%", true) - checkEvaluation("abc" like "b%", false) - checkEvaluation("abc" like "bc%", false) + checkEvaluation("abc" like "b%", false) + checkEvaluation("abc" like "bc%", false) } test("LIKE Non-literal Regular Expression") { @@ -190,10 +191,10 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation("axe" rlike "pi|apa", false) checkEvaluation("pip" rlike "^(pi)*$", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) intercept[java.util.regex.PatternSyntaxException] { evaluate("abbbbc" rlike "**") @@ -260,7 +261,9 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(Literal(23.toByte) + Cast(true, ByteType), 24.toByte) checkEvaluation(Literal(23.toShort) + Cast(true, ShortType), 24.toShort) - intercept[Exception] {evaluate(Literal(1) cast BinaryType, null)} + intercept[Exception] { + evaluate(Literal(1) cast BinaryType, null) + } assert(("abcdef" cast StringType).nullable === false) assert(("abcdef" cast BinaryType).nullable === false) @@ -287,7 +290,7 @@ class ExpressionEvaluationSuite extends FunSuite { test("timestamp casting") { val millis = 15 * 1000 + 2 val ts = new Timestamp(millis) - val ts1 = new Timestamp(15 * 1000) // a timestamp without the milliseconds part + val ts1 = new Timestamp(15 * 1000) // a timestamp without the milliseconds part checkEvaluation(Cast(ts, ShortType), 15) checkEvaluation(Cast(ts, IntegerType), 15) checkEvaluation(Cast(ts, LongType), 15) @@ -338,11 +341,11 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(If(Literal(false, BooleanType), Literal("a", StringType), Literal("b", StringType)), "b", row) - checkEvaluation(c1 in (c1, c2), true, row) + checkEvaluation(c1 in(c1, c2), true, row) checkEvaluation( Literal("^Ba*n", StringType) in (Literal("^Ba*n", StringType)), true, row) checkEvaluation( - Literal("^Ba*n", StringType) in (Literal("^Ba*n", StringType), c2), true, row) + Literal("^Ba*n", StringType) in(Literal("^Ba*n", StringType), c2), true, row) } test("case when") { @@ -390,11 +393,11 @@ class ExpressionEvaluationSuite extends FunSuite { test("complex type") { val row = new GenericRow(Array[Any]( - "^Ba*n", // 0 - null.asInstanceOf[String], // 1 - new GenericRow(Array[Any]("aa", "bb")), // 2 - Map("aa"->"bb"), // 3 - Seq("aa", "bb") // 4 + "^Ba*n", // 0 + null.asInstanceOf[String], // 1 + new GenericRow(Array[Any]("aa", "bb")), // 2 + Map("aa" -> "bb"), // 3 + Seq("aa", "bb") // 4 )) val typeS = StructType( @@ -425,7 +428,7 @@ class ExpressionEvaluationSuite extends FunSuite { :: StructField("b", StringType, nullable = false) :: Nil ) - assert(GetField(BoundReference(2,typeS, nullable = true), "a").nullable === true) + assert(GetField(BoundReference(2, typeS, nullable = true), "a").nullable === true) assert(GetField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable === false) assert(GetField(Literal(null, typeS), "a").nullable === true) @@ -573,27 +576,39 @@ class ExpressionEvaluationSuite extends FunSuite { test("Length") { checkEvaluation(Length(Literal(null, IntegerType)), null) - checkEvaluation(Length(Literal(0,IntegerType)), 1) - checkEvaluation(Length(Literal(12,IntegerType)), 2) - checkEvaluation(Length(Literal(123,IntegerType)), 3) + checkEvaluation(Length(Literal(0, IntegerType)), 1) + checkEvaluation(Length(Literal(12, IntegerType)), 2) + checkEvaluation(Length(Literal(123, IntegerType)), 3) checkEvaluation(Length(Literal(12.4F, FloatType)), 4) - checkEvaluation(Length(Literal(12345678901L,LongType)), 11) + checkEvaluation(Length(Literal(12345678901L, LongType)), 11) checkEvaluation(Length(Literal(1234567890.2D, DoubleType)), 14) - checkEvaluation(Length(Literal("1234567890ABC",StringType)), 13) - checkEvaluation(Length(Literal("\uF93D\uF936\uF949\uF942",StringType)), 4) + checkEvaluation(Length(Literal("1234567890ABC", StringType)), 13) + checkEvaluation(Length(Literal("\uF93D\uF936\uF949\uF942", StringType)), 4) } test("OctetLen") { - checkEvaluation(OctetLen(Literal(null, StringType), "ISO-8859-1"), null) - checkEvaluation(OctetLen(Literal(null, StringType), "UTF-8"), null) - checkEvaluation(OctetLen(Literal(null, StringType), "UTF-16"), null) - checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "ISO-8859-1"), 13) - checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "UTF-8"), 13) - checkEvaluation(OctetLen(Literal("1234567890ABC", StringType), "UTF-16"), 7) - checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "ISO-8859-1"), 4) - checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-8"), 4) - checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-16"), 2) - checkEvaluation(OctetLen(Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-32"), 1) + checkEvaluation(OctetLength(Literal(null, StringType), "ISO-8859-1"), null) + checkEvaluation(OctetLength(Literal(null, StringType), "UTF-8"), null) + checkEvaluation(OctetLength(Literal(null, StringType), "UTF-16"), null) + checkEvaluation(OctetLength(Literal("1234567890ABC", StringType), "ISO-8859-1"), 13) + checkEvaluation(OctetLength(Literal("1234567890ABC", StringType), "UTF-8"), 13) + checkEvaluation(OctetLength(Literal("1234567890ABC", StringType), "UTF-16"), 28) + checkEvaluation(OctetLength(Literal("1234567890ABC", StringType), "UTF-32"), 52) + checkEvaluation(OctetLength( + Literal("\uF93D\uF936\uF949\uF942", StringType), "ISO-8859-1"), 4) + // Chinese characters get truncated by ISO-8859-1 encoding + checkEvaluation(OctetLength( + Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-8"), 12) // chinese characters + checkEvaluation(OctetLength( + Literal("\uD840\uDC0B\uD842\uDFB7", StringType), "UTF-8"), 8) // 2 surrogate pairs + checkEvaluation(OctetLength( + Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-16"), 10) // chinese characters + checkEvaluation(OctetLength( + Literal("\uD840\uDC0B\uD842\uDFB7", StringType), "UTF-16"), 10) // 2 surrogate pairs + checkEvaluation(OctetLength( + Literal("\uF93D\uF936\uF949\uF942", StringType), "UTF-32"), 16) // chinese characters + checkEvaluation(OctetLength( + Literal("\uD840\uDC0B\uD842\uDFB7", StringType), "UTF-32"), 8) // 2 surrogate pairs } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 4997770d13c5a..d6549e9de1d1b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -211,7 +211,7 @@ class ConstantFoldingSuite extends PlanTest { Contains("abc", Literal(null, StringType)) as 'c20, Length(Literal(null, IntegerType)) as 'c21, - OctetLen(Literal(null, StringType), "ISO-8859-1") as 'c22 + OctetLength(Literal(null, StringType), "ISO-8859-1") as 'c22 ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5a8c3219f0cd3..8e834e6e3ce1e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -66,15 +66,15 @@ class SQLQuerySuite extends QueryTest { sql("SELECT max(LENGTH(s)) FROM nullableRepeatedData"), 4) } - test("SPARK-2686 Added Parser of SQL STRLEN()") { + test("SPARK-2686 Added Parser of SQL OCTET_LENGTH()") { checkAnswer( - sql("SELECT octet_len(s) from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_length(s) from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT octet_len(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_length(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT max(octet_len(s,'UTF-8')) from nullStrings"), 3) + sql("SELECT max(octet_length(s,'UTF-8')) from nullStrings"), 3) checkAnswer( - sql("SELECT octet_len('a','ISO-8859-1') + strlen('abcde','ISO-8859-1') FROM testData limit 1"), 6) + sql("SELECT octet_length('a','ISO-8859-1') + octet_length('abcde','ISO-8859-1') FROM testData limit 1"), 6) } test("index into array") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index c7ea1cc59a487..21a1c9252d526 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -866,8 +866,8 @@ private[hive] object HiveQl { val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r val SUBSTR = "(?i)SUBSTR(?:ING)?".r - val CHAR_LEN = "(?i)CHAR_LEN".r - val OCTET_LEN = "(?i)OCTET_LEN".r + val CHAR_LENGTH = "(?i)CHAR_LENGTH".r + val OCTET_LENGTH = "(?i)OCTET_LENGTH".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ @@ -999,10 +999,10 @@ private[hive] object HiveQl { Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) - case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: Nil) => - OctetLen(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) - case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: encoding :: Nil) => - OctetLen(nodeToExpr(string), nodeToExpr(encoding)) + case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: Nil) => + OctetLength(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) + case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: encoding :: Nil) => + OctetLength(nodeToExpr(string), nodeToExpr(encoding)) /* UDFs - Must be last otherwise will preempt built in functions */ From 81c64c331a70f16d1dd031b635027d352945ff6b Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 16:50:55 -0700 Subject: [PATCH 18/22] Revert whitespace/formatting changes on other sections of ExpressionEvaluationSuite --- .../ExpressionEvaluationSuite.scala | 108 +++++++++--------- 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index d3cab802dbf78..bc8f1cdfe8fc7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -18,16 +18,12 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.Timestamp -import java.util.concurrent.atomic.AtomicInteger import org.scalatest.FunSuite import org.apache.spark.sql.catalyst.types._ -import scala.collection.mutable.ArrayBuffer - /* Implicit conversions */ - import org.apache.spark.sql.catalyst.dsl.expressions._ class ExpressionEvaluationSuite extends FunSuite { @@ -63,8 +59,8 @@ class ExpressionEvaluationSuite extends FunSuite { val notTrueTable = (true, false) :: - (false, true) :: - (null, null) :: Nil + (false, true) :: + (null, null) :: Nil test("3VL Not") { notTrueTable.foreach { @@ -74,45 +70,45 @@ class ExpressionEvaluationSuite extends FunSuite { } booleanLogicTest("AND", _ && _, - (true, true, true) :: - (true, false, false) :: - (true, null, null) :: - (false, true, false) :: - (false, false, false) :: - (false, null, false) :: - (null, true, null) :: - (null, false, false) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, false) :: + (true, null, null) :: + (false, true, false) :: + (false, false, false) :: + (false, null, false) :: + (null, true, null) :: + (null, false, false) :: + (null, null, null) :: Nil) booleanLogicTest("OR", _ || _, - (true, true, true) :: - (true, false, true) :: - (true, null, true) :: - (false, true, true) :: - (false, false, false) :: - (false, null, null) :: - (null, true, true) :: - (null, false, null) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, true) :: + (true, null, true) :: + (false, true, true) :: + (false, false, false) :: + (false, null, null) :: + (null, true, true) :: + (null, false, null) :: + (null, null, null) :: Nil) booleanLogicTest("=", _ === _, - (true, true, true) :: - (true, false, false) :: - (true, null, null) :: - (false, true, false) :: - (false, false, true) :: - (false, null, null) :: - (null, true, null) :: - (null, false, null) :: - (null, null, null) :: Nil) + (true, true, true) :: + (true, false, false) :: + (true, null, null) :: + (false, true, false) :: + (false, false, true) :: + (false, null, null) :: + (null, true, null) :: + (null, false, null) :: + (null, null, null) :: Nil) def booleanLogicTest( - name: String, - op: (Expression, Expression) => Expression, - truthTable: Seq[(Any, Any, Any)]) { + name: String, + op: (Expression, Expression) => Expression, + truthTable: Seq[(Any, Any, Any)]) { test(s"3VL $name") { truthTable.foreach { - case (l, r, answer) => + case (l,r,answer) => val expr = op(Literal(l, BooleanType), Literal(r, BooleanType)) checkEvaluation(expr, answer) } @@ -127,8 +123,8 @@ class ExpressionEvaluationSuite extends FunSuite { val actual = try evaluate(expression, inputRow) catch { case e: Exception => fail(s"Exception evaluating $expression", e) } - if (actual != expected) { - val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" + if(actual != expected) { + val input = if(inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input") } } @@ -153,8 +149,8 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation("addb" like "a%", true) checkEvaluation("addb" like "**", false) checkEvaluation("abc" like "a%", true) - checkEvaluation("abc" like "b%", false) - checkEvaluation("abc" like "bc%", false) + checkEvaluation("abc" like "b%", false) + checkEvaluation("abc" like "bc%", false) } test("LIKE Non-literal Regular Expression") { @@ -191,10 +187,10 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation("axe" rlike "pi|apa", false) checkEvaluation("pip" rlike "^(pi)*$", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) intercept[java.util.regex.PatternSyntaxException] { evaluate("abbbbc" rlike "**") @@ -261,9 +257,7 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(Literal(23.toByte) + Cast(true, ByteType), 24.toByte) checkEvaluation(Literal(23.toShort) + Cast(true, ShortType), 24.toShort) - intercept[Exception] { - evaluate(Literal(1) cast BinaryType, null) - } + intercept[Exception] {evaluate(Literal(1) cast BinaryType, null)} assert(("abcdef" cast StringType).nullable === false) assert(("abcdef" cast BinaryType).nullable === false) @@ -290,7 +284,7 @@ class ExpressionEvaluationSuite extends FunSuite { test("timestamp casting") { val millis = 15 * 1000 + 2 val ts = new Timestamp(millis) - val ts1 = new Timestamp(15 * 1000) // a timestamp without the milliseconds part + val ts1 = new Timestamp(15 * 1000) // a timestamp without the milliseconds part checkEvaluation(Cast(ts, ShortType), 15) checkEvaluation(Cast(ts, IntegerType), 15) checkEvaluation(Cast(ts, LongType), 15) @@ -341,11 +335,11 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(If(Literal(false, BooleanType), Literal("a", StringType), Literal("b", StringType)), "b", row) - checkEvaluation(c1 in(c1, c2), true, row) + checkEvaluation(c1 in (c1, c2), true, row) checkEvaluation( Literal("^Ba*n", StringType) in (Literal("^Ba*n", StringType)), true, row) checkEvaluation( - Literal("^Ba*n", StringType) in(Literal("^Ba*n", StringType), c2), true, row) + Literal("^Ba*n", StringType) in (Literal("^Ba*n", StringType), c2), true, row) } test("case when") { @@ -393,11 +387,11 @@ class ExpressionEvaluationSuite extends FunSuite { test("complex type") { val row = new GenericRow(Array[Any]( - "^Ba*n", // 0 - null.asInstanceOf[String], // 1 - new GenericRow(Array[Any]("aa", "bb")), // 2 - Map("aa" -> "bb"), // 3 - Seq("aa", "bb") // 4 + "^Ba*n", // 0 + null.asInstanceOf[String], // 1 + new GenericRow(Array[Any]("aa", "bb")), // 2 + Map("aa"->"bb"), // 3 + Seq("aa", "bb") // 4 )) val typeS = StructType( @@ -428,7 +422,7 @@ class ExpressionEvaluationSuite extends FunSuite { :: StructField("b", StringType, nullable = false) :: Nil ) - assert(GetField(BoundReference(2, typeS, nullable = true), "a").nullable === true) + assert(GetField(BoundReference(2,typeS, nullable = true), "a").nullable === true) assert(GetField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable === false) assert(GetField(Literal(null, typeS), "a").nullable === true) From 94fcbd35bc31fc93244fddfdc16329778fc55013 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 19:08:16 -0700 Subject: [PATCH 19/22] Change default encoding to UTF-8 --- .../spark/sql/catalyst/expressions/stringOperations.scala | 5 +++-- .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 505642394e552..5a96c9c722411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -241,7 +241,7 @@ case class Length(child: Expression) extends UnaryExpression { } object OctetLengthConstants { - val DefaultEncoding = "ISO-8859-1" + val DefaultEncoding = "UTF-8" } object OctetLenUtils { @@ -294,7 +294,8 @@ case class OctetLength(child: Expression, encoding : Expression) extends UnaryEx evalInput.asInstanceOf[String].getBytes(strEncoding).length } catch { case ue : UnsupportedEncodingException => { - throw new UnsupportedEncodingException(s"OctetLen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") + throw new UnsupportedEncodingException( + s"OctetLen: Caught UnsupportedEncodingException for encoding=[$strEncoding]") } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 21a1c9252d526..a60d3dad8a176 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1000,7 +1000,7 @@ private[hive] object HiveQl { case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: Nil) => - OctetLength(nodeToExpr(string), Literal(StrlenConstants.DefaultEncoding)) + OctetLength(nodeToExpr(string), Literal(OctetLengthConstants.DefaultEncoding)) case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: encoding :: Nil) => OctetLength(nodeToExpr(string), nodeToExpr(encoding)) From a0a03d70203c2ff94ed74f9daeb28461f839831f Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 20:15:07 -0700 Subject: [PATCH 20/22] Replace len() method with simpler call to codePointCount --- out | 672 ++++++++++++++++++ .../expressions/stringOperations.scala | 18 +- 2 files changed, 674 insertions(+), 16 deletions(-) create mode 100644 out diff --git a/out b/out new file mode 100644 index 0000000000000..d24695e6107d9 --- /dev/null +++ b/out @@ -0,0 +1,672 @@ +[INFO] Scanning for projects... +[INFO] ------------------------------------------------------------------------ +[INFO] Reactor Build Order: +[INFO] +[INFO] Spark Project Parent POM +[INFO] Spark Project Core +[INFO] Spark Project Bagel +[INFO] Spark Project GraphX +[INFO] Spark Project ML Library +[INFO] Spark Project Streaming +[INFO] Spark Project Tools +[INFO] Spark Project Catalyst +[INFO] Spark Project SQL +[INFO] Spark Project Hive +[INFO] Spark Project REPL +[INFO] Spark Project YARN Parent POM +[INFO] Spark Project YARN Stable API +[INFO] Spark Project Assembly +[INFO] Spark Project External Twitter +[INFO] Spark Project External Kafka +[INFO] Spark Project External Flume Sink +[INFO] Spark Project External Flume +[INFO] Spark Project External ZeroMQ +[INFO] Spark Project External MQTT +[INFO] Spark Project Examples +[INFO] +[INFO] ------------------------------------------------------------------------ +[INFO] Building Spark Project Parent POM 1.1.0-SNAPSHOT +[INFO] ------------------------------------------------------------------------ +[INFO] +[INFO] --- maven-enforcer-plugin:1.3.1:enforce (enforce-versions) @ spark-parent --- +[INFO] +[INFO] --- build-helper-maven-plugin:1.8:add-source (add-scala-sources) @ spark-parent --- +[INFO] Source directory: /shared/strlen/src/main/scala added. +[INFO] +[INFO] --- maven-remote-resources-plugin:1.5:process (default) @ spark-parent --- +[INFO] +[INFO] --- scala-maven-plugin:3.1.6:compile (scala-compile-first) @ spark-parent --- +[INFO] No sources to compile +[INFO] +[INFO] --- build-helper-maven-plugin:1.8:add-test-source (add-scala-test-sources) @ spark-parent --- +[INFO] Test Source directory: /shared/strlen/src/test/scala added. +[INFO] +[INFO] --- scala-maven-plugin:3.1.6:testCompile (scala-test-compile-first) @ spark-parent --- +[INFO] No sources to compile +[INFO] +[INFO] ------------------------------------------------------------------------ +[INFO] Building Spark Project Core 1.1.0-SNAPSHOT +[INFO] ------------------------------------------------------------------------ +[INFO] +[INFO] --- maven-enforcer-plugin:1.3.1:enforce (enforce-versions) @ spark-core_2.10 --- +[INFO] +[INFO] --- build-helper-maven-plugin:1.8:add-source (add-scala-sources) @ spark-core_2.10 --- +[INFO] Source directory: /shared/strlen/core/src/main/scala added. +[INFO] +[INFO] --- maven-remote-resources-plugin:1.5:process (default) @ spark-core_2.10 --- +[INFO] +[INFO] --- exec-maven-plugin:1.2.1:exec (default) @ spark-core_2.10 --- +Archive: lib/py4j-0.8.2.1-src.zip + extracting: build/py4j/__init__.py + inflating: build/py4j/compat.py + inflating: build/py4j/finalizer.py + inflating: build/py4j/java_collections.py + inflating: build/py4j/java_gateway.py + inflating: build/py4j/protocol.py + extracting: build/py4j/tests/__init__.py + inflating: build/py4j/tests/byte_string_test.py + inflating: build/py4j/tests/finalizer_test.py + inflating: build/py4j/tests/java_array_test.py + inflating: build/py4j/tests/java_callback_test.py + inflating: build/py4j/tests/java_gateway_test.py + inflating: build/py4j/tests/java_list_test.py + inflating: build/py4j/tests/java_map_test.py + inflating: build/py4j/tests/java_set_test.py + inflating: build/py4j/tests/multithreadtest.py + inflating: build/py4j/tests/py4j_callback_example.py + inflating: build/py4j/tests/py4j_callback_example2.py + inflating: build/py4j/tests/py4j_example.py + inflating: build/py4j/version.py +[INFO] +[INFO] --- maven-resources-plugin:2.6:resources (default-resources) @ spark-core_2.10 --- +[INFO] Using 'UTF-8' encoding to copy filtered resources. +[INFO] Copying 9 resources +[INFO] Copying 21 resources +[INFO] Copying 7 resources +[INFO] Copying 3 resources +[INFO] +[INFO] --- scala-maven-plugin:3.1.6:compile (scala-compile-first) @ spark-core_2.10 --- +[INFO] Using zinc server for incremental compilation +[INFO] compiler plugin: BasicArtifact(org.scalamacros,paradise_2.10.4,2.0.1,null) +[info] Compile success at Aug 1, 2014 7:55:25 PM [0.258s] +[INFO] +[INFO] --- maven-compiler-plugin:3.1:compile (default-compile) @ spark-core_2.10 --- +[INFO] Changes detected - recompiling the module! +[INFO] Compiling 35 source files to /shared/strlen/core/target/scala-2.10/classes +[INFO] +[INFO] --- build-helper-maven-plugin:1.8:add-test-source (add-scala-test-sources) @ spark-core_2.10 --- +[INFO] Test Source directory: /shared/strlen/core/src/test/scala added. +[INFO] +[INFO] --- maven-resources-plugin:2.6:testResources (default-testResources) @ spark-core_2.10 --- +[INFO] Using 'UTF-8' encoding to copy filtered resources. +[INFO] Copying 4 resources +[INFO] Copying 3 resources +[INFO] +[INFO] --- scala-maven-plugin:3.1.6:testCompile (scala-test-compile-first) @ spark-core_2.10 --- +[INFO] Using zinc server for incremental compilation +[INFO] compiler plugin: BasicArtifact(org.scalamacros,paradise_2.10.4,2.0.1,null) +[info] Compiling 1 Scala source and 1 Java source to /shared/strlen/core/target/scala-2.10/test-classes... +[warn] Note: /shared/strlen/core/src/test/java/org/apache/spark/JavaAPISuite.java uses or overrides a deprecated API. +[warn] Note: Recompile with -Xlint:deprecation for details. +[warn] Note: /shared/strlen/core/src/test/java/org/apache/spark/JavaAPISuite.java uses unchecked or unsafe operations. +[warn] Note: Recompile with -Xlint:unchecked for details. +[info] Compile success at Aug 1, 2014 7:55:32 PM [4.592s] +[INFO] +[INFO] --- maven-compiler-plugin:3.1:testCompile (default-testCompile) @ spark-core_2.10 --- +[INFO] Nothing to compile - all classes are up to date +[INFO] +[INFO] --- maven-surefire-plugin:2.17:test (default-test) @ spark-core_2.10 --- +[INFO] Tests are skipped. +[INFO] +[INFO] --- scalatest-maven-plugin:1.0-RC2:test (test) @ spark-core_2.10 --- +Discovery starting. +Discovery completed in 6 seconds, 462 milliseconds. +Run starting. Expected test count is: 724 +ExternalSorterSuite: +- empty data stream +- few elements per partition +- empty partitions with spilling +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling in local cluster +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling in local cluster with many reduce tasks +- cleanup of intermediate files in sorter +- cleanup of intermediate files in sorter if there are errors +- cleanup of intermediate files in shuffle +- cleanup of intermediate files in shuffle with errors +- no partial aggregation or sorting +- partial aggregation without spill +- partial aggregation with spill, no ordering +- partial aggregation with spill, with ordering +- sorting without aggregation, no spill +- sorting without aggregation, with spill +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling with hash collisions +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling with many hash collisions +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling with hash collisions using the Int.MaxValue key +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- spilling with null keys and values +DAGSchedulerSuite: +- zero split job +- run trivial job +- local job +- local job oom +- run trivial job w/ dependency +- cache location preferences w/ dependency +- avoid exponential blowup when getting preferred locs list +- unserializable task +- trivial job failure +- trivial job cancellation +- job cancellation no-kill backend +- run trivial shuffle +- run trivial shuffle with fetch failure +- ignore late map task completions +- run shuffle with map stage failure +- failure of stage used by two jobs +- run trivial shuffle with out-of-band failure and retry +- recursive shuffle failures +- cached post-shuffle +- misbehaved accumulator should not crash DAGScheduler and SparkContext !!! IGNORED !!! +- misbehaved resultHandler should not crash DAGScheduler and SparkContext +[ERROR] [08/01/2014 19:57:21.310] [test-akka.actor.default-dispatcher-3] [akka://test/user/dagSupervisor/$a] error +org.apache.spark.SparkException: error + at org.apache.spark.scheduler.BuggyDAGEventProcessActor$$anonfun$receive$1.applyOrElse(DAGSchedulerSuite.scala:39) + at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) + at akka.actor.ActorCell.invoke(ActorCell.scala:456) + at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) + at akka.dispatch.Mailbox.run(Mailbox.scala:219) + at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) + at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) + at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) + at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) + at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) + +- DAGSchedulerActorSupervisor closes the SparkContext when EventProcessActor crashes +[ERROR] [08/01/2014 19:57:21.321] [DAGSchedulerSuite-akka.actor.default-dispatcher-3] [akka://DAGSchedulerSuite/user/$$a] Job cancelled because SparkContext was shut down +org.apache.spark.SparkException: Job cancelled because SparkContext was shut down + at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:688) + at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:687) + at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) + at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:687) + at org.apache.spark.scheduler.DAGSchedulerEventProcessActor.postStop(DAGScheduler.scala:1342) + at akka.actor.dungeon.FaultHandling$class.akka$actor$dungeon$FaultHandling$$finishTerminate(FaultHandling.scala:201) + at akka.actor.dungeon.FaultHandling$class.terminate(FaultHandling.scala:163) + at akka.actor.ActorCell.terminate(ActorCell.scala:338) + at akka.actor.ActorCell.invokeAll$1(ActorCell.scala:431) + at akka.actor.ActorCell.systemInvoke(ActorCell.scala:447) + at akka.dispatch.Mailbox.processAllSystemMessages(Mailbox.scala:262) + at akka.testkit.CallingThreadDispatcher.process$1(CallingThreadDispatcher.scala:244) + at akka.testkit.CallingThreadDispatcher.runQueue(CallingThreadDispatcher.scala:284) + at akka.testkit.CallingThreadDispatcher.systemDispatch(CallingThreadDispatcher.scala:192) + at akka.actor.dungeon.Dispatch$class.stop(Dispatch.scala:106) + at akka.actor.ActorCell.stop(ActorCell.scala:338) + at akka.actor.LocalActorRef.stop(ActorRef.scala:340) + at akka.actor.dungeon.Children$class.stop(Children.scala:66) + at akka.actor.ActorCell.stop(ActorCell.scala:338) + at akka.actor.dungeon.FaultHandling$$anonfun$terminate$1.apply(FaultHandling.scala:149) + at akka.actor.dungeon.FaultHandling$$anonfun$terminate$1.apply(FaultHandling.scala:149) + at scala.collection.Iterator$class.foreach(Iterator.scala:727) + at akka.util.Collections$PartialImmutableValuesIterable$$anon$1.foreach(Collections.scala:27) + at akka.util.Collections$PartialImmutableValuesIterable.foreach(Collections.scala:52) + at akka.actor.dungeon.FaultHandling$class.terminate(FaultHandling.scala:149) + at akka.actor.ActorCell.terminate(ActorCell.scala:338) + at akka.actor.ActorCell.invokeAll$1(ActorCell.scala:431) + at akka.actor.ActorCell.systemInvoke(ActorCell.scala:447) + at akka.dispatch.Mailbox.processAllSystemMessages(Mailbox.scala:262) + at akka.dispatch.Mailbox.run(Mailbox.scala:218) + at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) + at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) + at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) + at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) + at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) + +RDDSuite: +- basic operations +- serialization +- countApproxDistinct +- SparkContext.union +- partitioner aware union +- UnionRDD partition serialized size should be small +- aggregate +- basic caching +- caching with failures +- empty RDD +- repartitioned RDDs +- repartitioned RDDs perform load balancing +- coalesced RDDs +- coalesced RDDs with locality +- coalesced RDDs with locality, large scale (10K partitions) +- coalesced RDDs with locality, fail first pass +- zipped RDDs +- partition pruning +- mapWith +- flatMapWith +- filterWith +- take +- top with predefined ordering +- top with custom ordering +- takeOrdered with predefined ordering +- takeOrdered with custom ordering +- sample preserves partitioner +- takeSample +- takeSample from an empty rdd +- randomSplit +- runJob on an invalid partition +- sort an empty RDD +- sortByKey +- sortByKey ascending parameter +- sortByKey with explicit ordering +- intersection +- intersection strips duplicates in an input +- zipWithIndex +- zipWithIndex with a single partition +- zipWithUniqueId +- retag with implicit ClassTag +- getNarrowAncestors +- getNarrowAncestors with multiple parents +- getNarrowAncestors with cycles +UtilsSuite: +- bytesToString +- copyStream +- memoryStringToMb +- splitCommandString +- string formatting of time durations +- reading offset bytes of a file +- reading offset bytes across multiple files +- deserialize long value +- get iterator size +- findOldFiles +- resolveURI +- nonLocalPaths +SortingSuite: +- sortByKey +- large array +- large array with one split +- large array with many partitions +- sort descending +- sort descending with one split +- sort descending with many partitions +- more partitions than elements +- empty RDD +- partition balancing +- partition balancing for descending sort +TaskContextSuite: +- Calls executeOnCompleteCallbacks after failure +NextIteratorSuite: +- one iteration +- two iterations +- empty iteration +- close is called once for empty iterations +- close is called once for non-empty iterations +ParallelCollectionSplitSuite: +- one element per slice +- one slice +- equal slices +- non-equal slices +- splitting exclusive range +- splitting inclusive range +- empty data +- zero slices +- negative number of slices +- exclusive ranges sliced into ranges +- inclusive ranges sliced into ranges +- identical slice sizes between Range and NumericRange +- identical slice sizes between List and NumericRange +- large ranges don't overflow +- random array tests +- random exclusive range tests +- random inclusive range tests +- exclusive ranges of longs +- inclusive ranges of longs +- exclusive ranges of doubles +- inclusive ranges of doubles +ExecutorURLClassLoaderSuite: +- child first +- parent first +- child first can fall back +- child first can fail +- driver sets context class loader in local mode +ExecutorRunnerTest: +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- command includes appId +EventLoggingListenerSuite: +- Parse names of special files +- Verify special files exist +- Verify special files exist with compression +- Parse event logging info +- Parse event logging info with compression +- Basic event logging +- Basic event logging with compression +- End-to-end event logging +- End-to-end event logging with compression +DriverRunnerTest: +- Process succeeds instantly +- Process failing several times and then succeeding +- Process doesn't restart if not supervised +- Process doesn't restart if killed +- Reset of backoff counter +PairRDDFunctionsSuite: +- aggregateByKey +- groupByKey +- groupByKey with duplicates +- groupByKey with negative key hash codes +- groupByKey with many output partitions +- sampleByKey +- reduceByKey +- reduceByKey with collectAsMap +- reduceByKey with many output partitons +- reduceByKey with partitioner +- countApproxDistinctByKey +- join +- join all-to-all +- leftOuterJoin +- rightOuterJoin +- join with no matches +- join with many output partitions +- groupWith +- groupWith3 +- groupWith4 +- zero-partition RDD +- keys and values +- default partitioner uses partition size +- default partitioner uses largest partitioner +- subtract +- subtract with narrow dependency +- subtractByKey +- subtractByKey with narrow dependency +- foldByKey +- foldByKey with mutable result type +- saveNewAPIHadoopFile should call setConf if format is configurable +- lookup +- lookup with partitioner +- lookup with bad partitioner +PrimitiveVectorSuite: +- primitive value +- non-primitive value +- ideal growth +- ideal size +- resizing +MetricsConfigSuite: +- MetricsConfig with default properties +- MetricsConfig with properties set +- MetricsConfig with subProperties +SparkContextSchedulerCreationSuite: +- bad-master +- local +- local-* +- local-n +- local-*-n-failures +- local-n-failures +- bad-local-n +- bad-local-n-failures +- local-default-parallelism +- simr +- local-cluster +- yarn-cluster +- yarn-standalone +- yarn-client +Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib +- mesos fine-grained +Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib +- mesos coarse-grained +Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib +- mesos with zookeeper +SamplingUtilsSuite: +- reservoirSampleAndCount +- computeFraction +TimeStampedHashMapSuite: +- HashMap - basic test +- TimeStampedHashMap - basic test +- TimeStampedHashMap - threading safety test +- TimeStampedWeakValueHashMap - basic test +- TimeStampedWeakValueHashMap - threading safety test +- TimeStampedHashMap - clearing by timestamp +- TimeStampedWeakValueHashMap - clearing by timestamp +- TimeStampedWeakValueHashMap - clearing weak references +RandomSamplerSuite: +- BernoulliSamplerWithRange +- BernoulliSamplerWithRangeInverse +- BernoulliSamplerWithRatio +- BernoulliSamplerWithComplement +- BernoulliSamplerSetSeed +- PoissonSampler +ImplicitOrderingSuite: +- basic inference of Orderings +ClosureCleanerSuite: +- closures inside an object +- closures inside a class +- closures inside a class with no default constructor +- closures that don't use fields of the outer class +- nested closures inside an object +- nested closures inside a class +- toplevel return statements in closures are identified at cleaning time +- return statements from named functions nested in closures don't raise exceptions +UnpersistSuite: +- unpersist RDD +TaskSetManagerSuite: +- TaskSet with no preferences +- multiple offers with no preferences +- skip unsatisfiable locality levels +- basic delay scheduling +- delay scheduling with fallback +- delay scheduling with failed hosts +- task result lost +- repeated failures lead to task set abortion +- executors should be blacklisted after task failure, in spite of locality preferences +- new executors get added +- test RACK_LOCAL tasks +- do not emit warning when serialized task is small +- emit warning when serialized task is large +DriverSuite: +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- driver should exit after finishing +CompactBufferSuite: +- empty buffer +- basic inserts +- adding sequences +- adding the same buffer to itself +CacheManagerSuite: +- get uncached rdd +- get cached rdd +- get uncached local rdd +TaskSchedulerImplSuite: +- FIFO Scheduler Test +- Fair Scheduler Test +- Nested Pool Test +- Scheduler does not always schedule tasks on the same workers +- Scheduler correctly accounts for multiple CPUs per task +SparkConfSuite: +- loading from system properties +- initializing without loading defaults +- named set methods +- basic get and set +- creating SparkContext without master and app name +- creating SparkContext without master +- creating SparkContext without app name +- creating SparkContext with both master and app name +- SparkContext property overriding +- nested property names +BlockManagerSuite: +- StorageLevel object caching +- BlockManagerId object caching +- master + 1 manager interaction +- master + 2 managers interaction +- removing block +- removing rdd +- removing broadcast +- reregistration on heart beat +- reregistration on block update +- reregistration doesn't dead lock +Some(org.apache.spark.storage.BlockResult@12a52bbe) +- correct BlockResult returned from get() calls +- in-memory LRU storage +- in-memory LRU storage with serialization +- in-memory LRU for partitions of same RDD +- in-memory LRU for partitions of multiple RDDs +- tachyon storage + + tachyon storage test disabled.  +- on-disk storage +- disk and memory storage +- disk and memory storage with getLocalBytes +- disk and memory storage with serialization +- disk and memory storage with serialization and getLocalBytes +- LRU with mixed storage levels +- in-memory LRU with streams +- LRU with mixed storage levels and streams +- negative byte values in ByteBufferInputStream +- overly large block +- block compression +- block store put failure +- reads of memory-mapped and non memory-mapped files are equivalent +- updated block statuses +- query block statuses +- get matching blocks +- SPARK-1194 regression: fix the same-RDD rule for cache replacement +- reserve/release unroll memory +- safely unroll blocks +- safely unroll blocks through putIterator +- safely unroll blocks through putIterator (disk) +- multiple unrolls by the same thread +PythonRunnerSuite: +- format path +- format paths +BitSetSuite: +- basic set and get +- 100% full bit set +- nextSetBit +- xor len(bitsetX) < len(bitsetY) +- xor len(bitsetX) > len(bitsetY) +- andNot len(bitsetX) < len(bitsetY) +- andNot len(bitsetX) > len(bitsetY) +AsyncRDDActionsSuite: +- countAsync +- collectAsync +- foreachAsync +- foreachPartitionAsync +- takeAsync +- async success handling +- async failure handling +- FutureAction result, infinite wait +- FutureAction result, finite wait +- FutureAction result, timeout +MetricsSystemSuite: +- MetricsSystem with default config +- MetricsSystem with sources add +JobCancellationSuite: +- local mode, FIFO scheduler +- local mode, fair scheduler +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- cluster mode, FIFO scheduler +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- cluster mode, fair scheduler +- do not put partially executed partitions into cache +- job group +- job group with interruption +- two jobs sharing the same stage !!! IGNORED !!! +PartitioningSuite: +- HashPartitioner equality +- RangePartitioner equality +- RangePartitioner getPartition +- RangePartitioner for keys that are not Comparable (but with Ordering) +- RangPartitioner.sketch +- RangePartitioner.determineBounds +- RangePartitioner should run only one job if data is roughly balanced +- RangePartitioner should work well on unbalanced data +- RangePartitioner should return a single partition for empty RDDs +- HashPartitioner not equal to RangePartitioner +- partitioner preservation +- partitioning Java arrays should fail +- zero-length partitions should be correctly handled +SecurityManagerSuite: +- set security with conf +- set security with api +UISuite: +- basic ui visibility !!! IGNORED !!! +- visibility at localhost:4040 !!! IGNORED !!! +- attaching a new tab !!! IGNORED !!! +- jetty selects different port under contention +- jetty binds to port 0 correctly +- verify appUIAddress contains the scheme +- verify appUIAddress contains the port +SortShuffleSuite: +- groupByKey without compression +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- shuffle non-zero block size +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- shuffle serializer +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- zero sized blocks +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- zero sized blocks without kryo +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- shuffle on mutable pairs +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- sorting on mutable pairs +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- cogroup using mutable pairs +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- subtract mutable pairs +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- sort with Java non serializable class - Kryo +Spark assembly has been built with Hive, including Datanucleus jars on classpath +Spark assembly has been built with Hive, including Datanucleus jars on classpath +- sort with Java non serializable class - Java +/bin/sh: line 1: 1895 Killed java -Dbasedir=/shared/strlen/core -Xmx3g -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m org.scalatest.tools.Runner -R '/shared/strlen/core/target/scala-2.10/classes /shared/strlen/core/target/scala-2.10/test-classes' -o -f /shared/strlen/core/target/surefire-reports/shared/strlen/core/target/SparkTestSuite.txt -u /shared/strlen/core/target/surefire-reports/. +[INFO] ------------------------------------------------------------------------ +[INFO] Reactor Summary: +[INFO] +[INFO] Spark Project Parent POM .......................... SUCCESS [2.021s] +[INFO] Spark Project Core ................................ FAILURE [8:09.635s] +[INFO] Spark Project Bagel ............................... SKIPPED +[INFO] Spark Project GraphX .............................. SKIPPED +[INFO] Spark Project ML Library .......................... SKIPPED +[INFO] Spark Project Streaming ........................... SKIPPED +[INFO] Spark Project Tools ............................... SKIPPED +[INFO] Spark Project Catalyst ............................ SKIPPED +[INFO] Spark Project SQL ................................. SKIPPED +[INFO] Spark Project Hive ................................ SKIPPED +[INFO] Spark Project REPL ................................ SKIPPED +[INFO] Spark Project YARN Parent POM ..................... SKIPPED +[INFO] Spark Project YARN Stable API ..................... SKIPPED +[INFO] Spark Project Assembly ............................ SKIPPED +[INFO] Spark Project External Twitter .................... SKIPPED +[INFO] Spark Project External Kafka ...................... SKIPPED +[INFO] Spark Project External Flume Sink ................. SKIPPED +[INFO] Spark Project External Flume ...................... SKIPPED +[INFO] Spark Project External ZeroMQ ..................... SKIPPED +[INFO] Spark Project External MQTT ....................... SKIPPED +[INFO] Spark Project Examples ............................ SKIPPED +[INFO] ------------------------------------------------------------------------ +[INFO] BUILD FAILURE +[INFO] ------------------------------------------------------------------------ +[INFO] Total time: 8:12.473s +[INFO] Finished at: Fri Aug 01 20:03:30 PDT 2014 +[INFO] Final Memory: 30M/697M +[INFO] ------------------------------------------------------------------------ +[ERROR] Failed to execute goal org.scalatest:scalatest-maven-plugin:1.0-RC2:test (test) on project spark-core_2.10: There are test failures -> [Help 1] +[ERROR] +[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. +[ERROR] Re-run Maven using the -X switch to enable full debug logging. +[ERROR] +[ERROR] For more information about the errors and possible solutions, please read the following articles: +[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException +[ERROR] +[ERROR] After correcting the problems, you can resume the build with the command +[ERROR] mvn -rf :spark-core_2.10 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 5a96c9c722411..65be1a76fa35d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -234,30 +234,16 @@ case class Length(child: Expression) extends UnaryExpression { } else if (!inputVal.isInstanceOf[String]) { inputVal.toString.length } else { - OctetLenUtils.len(inputVal.asInstanceOf[String]) + val str = inputVal.asInstanceOf[String] + str.codePointCount(0, str.length) } } - } object OctetLengthConstants { val DefaultEncoding = "UTF-8" } -object OctetLenUtils { - def len(s : String) = { - if (s == null) { - null - } else { - @inline def isUtfStartByte(b : Byte) = (b & 0xC0) != 0x80 - s.getBytes.foldLeft(0) { case (cnt, b) => { - cnt + (if (isUtfStartByte(b)) 1 else 0) - } - } - } - } -} - /** * A function that returns the number of characters in a string expression */ From 91761be0b0433a81d5b01c13aeac718decae8df3 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Fri, 1 Aug 2014 20:17:18 -0700 Subject: [PATCH 21/22] Remove spurious output log file --- out | 672 ------------------------------------------------------------ 1 file changed, 672 deletions(-) delete mode 100644 out diff --git a/out b/out deleted file mode 100644 index d24695e6107d9..0000000000000 --- a/out +++ /dev/null @@ -1,672 +0,0 @@ -[INFO] Scanning for projects... -[INFO] ------------------------------------------------------------------------ -[INFO] Reactor Build Order: -[INFO] -[INFO] Spark Project Parent POM -[INFO] Spark Project Core -[INFO] Spark Project Bagel -[INFO] Spark Project GraphX -[INFO] Spark Project ML Library -[INFO] Spark Project Streaming -[INFO] Spark Project Tools -[INFO] Spark Project Catalyst -[INFO] Spark Project SQL -[INFO] Spark Project Hive -[INFO] Spark Project REPL -[INFO] Spark Project YARN Parent POM -[INFO] Spark Project YARN Stable API -[INFO] Spark Project Assembly -[INFO] Spark Project External Twitter -[INFO] Spark Project External Kafka -[INFO] Spark Project External Flume Sink -[INFO] Spark Project External Flume -[INFO] Spark Project External ZeroMQ -[INFO] Spark Project External MQTT -[INFO] Spark Project Examples -[INFO] -[INFO] ------------------------------------------------------------------------ -[INFO] Building Spark Project Parent POM 1.1.0-SNAPSHOT -[INFO] ------------------------------------------------------------------------ -[INFO] -[INFO] --- maven-enforcer-plugin:1.3.1:enforce (enforce-versions) @ spark-parent --- -[INFO] -[INFO] --- build-helper-maven-plugin:1.8:add-source (add-scala-sources) @ spark-parent --- -[INFO] Source directory: /shared/strlen/src/main/scala added. -[INFO] -[INFO] --- maven-remote-resources-plugin:1.5:process (default) @ spark-parent --- -[INFO] -[INFO] --- scala-maven-plugin:3.1.6:compile (scala-compile-first) @ spark-parent --- -[INFO] No sources to compile -[INFO] -[INFO] --- build-helper-maven-plugin:1.8:add-test-source (add-scala-test-sources) @ spark-parent --- -[INFO] Test Source directory: /shared/strlen/src/test/scala added. -[INFO] -[INFO] --- scala-maven-plugin:3.1.6:testCompile (scala-test-compile-first) @ spark-parent --- -[INFO] No sources to compile -[INFO] -[INFO] ------------------------------------------------------------------------ -[INFO] Building Spark Project Core 1.1.0-SNAPSHOT -[INFO] ------------------------------------------------------------------------ -[INFO] -[INFO] --- maven-enforcer-plugin:1.3.1:enforce (enforce-versions) @ spark-core_2.10 --- -[INFO] -[INFO] --- build-helper-maven-plugin:1.8:add-source (add-scala-sources) @ spark-core_2.10 --- -[INFO] Source directory: /shared/strlen/core/src/main/scala added. -[INFO] -[INFO] --- maven-remote-resources-plugin:1.5:process (default) @ spark-core_2.10 --- -[INFO] -[INFO] --- exec-maven-plugin:1.2.1:exec (default) @ spark-core_2.10 --- -Archive: lib/py4j-0.8.2.1-src.zip - extracting: build/py4j/__init__.py - inflating: build/py4j/compat.py - inflating: build/py4j/finalizer.py - inflating: build/py4j/java_collections.py - inflating: build/py4j/java_gateway.py - inflating: build/py4j/protocol.py - extracting: build/py4j/tests/__init__.py - inflating: build/py4j/tests/byte_string_test.py - inflating: build/py4j/tests/finalizer_test.py - inflating: build/py4j/tests/java_array_test.py - inflating: build/py4j/tests/java_callback_test.py - inflating: build/py4j/tests/java_gateway_test.py - inflating: build/py4j/tests/java_list_test.py - inflating: build/py4j/tests/java_map_test.py - inflating: build/py4j/tests/java_set_test.py - inflating: build/py4j/tests/multithreadtest.py - inflating: build/py4j/tests/py4j_callback_example.py - inflating: build/py4j/tests/py4j_callback_example2.py - inflating: build/py4j/tests/py4j_example.py - inflating: build/py4j/version.py -[INFO] -[INFO] --- maven-resources-plugin:2.6:resources (default-resources) @ spark-core_2.10 --- -[INFO] Using 'UTF-8' encoding to copy filtered resources. -[INFO] Copying 9 resources -[INFO] Copying 21 resources -[INFO] Copying 7 resources -[INFO] Copying 3 resources -[INFO] -[INFO] --- scala-maven-plugin:3.1.6:compile (scala-compile-first) @ spark-core_2.10 --- -[INFO] Using zinc server for incremental compilation -[INFO] compiler plugin: BasicArtifact(org.scalamacros,paradise_2.10.4,2.0.1,null) -[info] Compile success at Aug 1, 2014 7:55:25 PM [0.258s] -[INFO] -[INFO] --- maven-compiler-plugin:3.1:compile (default-compile) @ spark-core_2.10 --- -[INFO] Changes detected - recompiling the module! -[INFO] Compiling 35 source files to /shared/strlen/core/target/scala-2.10/classes -[INFO] -[INFO] --- build-helper-maven-plugin:1.8:add-test-source (add-scala-test-sources) @ spark-core_2.10 --- -[INFO] Test Source directory: /shared/strlen/core/src/test/scala added. -[INFO] -[INFO] --- maven-resources-plugin:2.6:testResources (default-testResources) @ spark-core_2.10 --- -[INFO] Using 'UTF-8' encoding to copy filtered resources. -[INFO] Copying 4 resources -[INFO] Copying 3 resources -[INFO] -[INFO] --- scala-maven-plugin:3.1.6:testCompile (scala-test-compile-first) @ spark-core_2.10 --- -[INFO] Using zinc server for incremental compilation -[INFO] compiler plugin: BasicArtifact(org.scalamacros,paradise_2.10.4,2.0.1,null) -[info] Compiling 1 Scala source and 1 Java source to /shared/strlen/core/target/scala-2.10/test-classes... -[warn] Note: /shared/strlen/core/src/test/java/org/apache/spark/JavaAPISuite.java uses or overrides a deprecated API. -[warn] Note: Recompile with -Xlint:deprecation for details. -[warn] Note: /shared/strlen/core/src/test/java/org/apache/spark/JavaAPISuite.java uses unchecked or unsafe operations. -[warn] Note: Recompile with -Xlint:unchecked for details. -[info] Compile success at Aug 1, 2014 7:55:32 PM [4.592s] -[INFO] -[INFO] --- maven-compiler-plugin:3.1:testCompile (default-testCompile) @ spark-core_2.10 --- -[INFO] Nothing to compile - all classes are up to date -[INFO] -[INFO] --- maven-surefire-plugin:2.17:test (default-test) @ spark-core_2.10 --- -[INFO] Tests are skipped. -[INFO] -[INFO] --- scalatest-maven-plugin:1.0-RC2:test (test) @ spark-core_2.10 --- -Discovery starting. -Discovery completed in 6 seconds, 462 milliseconds. -Run starting. Expected test count is: 724 -ExternalSorterSuite: -- empty data stream -- few elements per partition -- empty partitions with spilling -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling in local cluster -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling in local cluster with many reduce tasks -- cleanup of intermediate files in sorter -- cleanup of intermediate files in sorter if there are errors -- cleanup of intermediate files in shuffle -- cleanup of intermediate files in shuffle with errors -- no partial aggregation or sorting -- partial aggregation without spill -- partial aggregation with spill, no ordering -- partial aggregation with spill, with ordering -- sorting without aggregation, no spill -- sorting without aggregation, with spill -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling with hash collisions -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling with many hash collisions -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling with hash collisions using the Int.MaxValue key -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- spilling with null keys and values -DAGSchedulerSuite: -- zero split job -- run trivial job -- local job -- local job oom -- run trivial job w/ dependency -- cache location preferences w/ dependency -- avoid exponential blowup when getting preferred locs list -- unserializable task -- trivial job failure -- trivial job cancellation -- job cancellation no-kill backend -- run trivial shuffle -- run trivial shuffle with fetch failure -- ignore late map task completions -- run shuffle with map stage failure -- failure of stage used by two jobs -- run trivial shuffle with out-of-band failure and retry -- recursive shuffle failures -- cached post-shuffle -- misbehaved accumulator should not crash DAGScheduler and SparkContext !!! IGNORED !!! -- misbehaved resultHandler should not crash DAGScheduler and SparkContext -[ERROR] [08/01/2014 19:57:21.310] [test-akka.actor.default-dispatcher-3] [akka://test/user/dagSupervisor/$a] error -org.apache.spark.SparkException: error - at org.apache.spark.scheduler.BuggyDAGEventProcessActor$$anonfun$receive$1.applyOrElse(DAGSchedulerSuite.scala:39) - at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) - at akka.actor.ActorCell.invoke(ActorCell.scala:456) - at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) - at akka.dispatch.Mailbox.run(Mailbox.scala:219) - at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) - at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) - at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) - at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) - at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) - -- DAGSchedulerActorSupervisor closes the SparkContext when EventProcessActor crashes -[ERROR] [08/01/2014 19:57:21.321] [DAGSchedulerSuite-akka.actor.default-dispatcher-3] [akka://DAGSchedulerSuite/user/$$a] Job cancelled because SparkContext was shut down -org.apache.spark.SparkException: Job cancelled because SparkContext was shut down - at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:688) - at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:687) - at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) - at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:687) - at org.apache.spark.scheduler.DAGSchedulerEventProcessActor.postStop(DAGScheduler.scala:1342) - at akka.actor.dungeon.FaultHandling$class.akka$actor$dungeon$FaultHandling$$finishTerminate(FaultHandling.scala:201) - at akka.actor.dungeon.FaultHandling$class.terminate(FaultHandling.scala:163) - at akka.actor.ActorCell.terminate(ActorCell.scala:338) - at akka.actor.ActorCell.invokeAll$1(ActorCell.scala:431) - at akka.actor.ActorCell.systemInvoke(ActorCell.scala:447) - at akka.dispatch.Mailbox.processAllSystemMessages(Mailbox.scala:262) - at akka.testkit.CallingThreadDispatcher.process$1(CallingThreadDispatcher.scala:244) - at akka.testkit.CallingThreadDispatcher.runQueue(CallingThreadDispatcher.scala:284) - at akka.testkit.CallingThreadDispatcher.systemDispatch(CallingThreadDispatcher.scala:192) - at akka.actor.dungeon.Dispatch$class.stop(Dispatch.scala:106) - at akka.actor.ActorCell.stop(ActorCell.scala:338) - at akka.actor.LocalActorRef.stop(ActorRef.scala:340) - at akka.actor.dungeon.Children$class.stop(Children.scala:66) - at akka.actor.ActorCell.stop(ActorCell.scala:338) - at akka.actor.dungeon.FaultHandling$$anonfun$terminate$1.apply(FaultHandling.scala:149) - at akka.actor.dungeon.FaultHandling$$anonfun$terminate$1.apply(FaultHandling.scala:149) - at scala.collection.Iterator$class.foreach(Iterator.scala:727) - at akka.util.Collections$PartialImmutableValuesIterable$$anon$1.foreach(Collections.scala:27) - at akka.util.Collections$PartialImmutableValuesIterable.foreach(Collections.scala:52) - at akka.actor.dungeon.FaultHandling$class.terminate(FaultHandling.scala:149) - at akka.actor.ActorCell.terminate(ActorCell.scala:338) - at akka.actor.ActorCell.invokeAll$1(ActorCell.scala:431) - at akka.actor.ActorCell.systemInvoke(ActorCell.scala:447) - at akka.dispatch.Mailbox.processAllSystemMessages(Mailbox.scala:262) - at akka.dispatch.Mailbox.run(Mailbox.scala:218) - at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) - at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) - at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) - at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) - at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) - -RDDSuite: -- basic operations -- serialization -- countApproxDistinct -- SparkContext.union -- partitioner aware union -- UnionRDD partition serialized size should be small -- aggregate -- basic caching -- caching with failures -- empty RDD -- repartitioned RDDs -- repartitioned RDDs perform load balancing -- coalesced RDDs -- coalesced RDDs with locality -- coalesced RDDs with locality, large scale (10K partitions) -- coalesced RDDs with locality, fail first pass -- zipped RDDs -- partition pruning -- mapWith -- flatMapWith -- filterWith -- take -- top with predefined ordering -- top with custom ordering -- takeOrdered with predefined ordering -- takeOrdered with custom ordering -- sample preserves partitioner -- takeSample -- takeSample from an empty rdd -- randomSplit -- runJob on an invalid partition -- sort an empty RDD -- sortByKey -- sortByKey ascending parameter -- sortByKey with explicit ordering -- intersection -- intersection strips duplicates in an input -- zipWithIndex -- zipWithIndex with a single partition -- zipWithUniqueId -- retag with implicit ClassTag -- getNarrowAncestors -- getNarrowAncestors with multiple parents -- getNarrowAncestors with cycles -UtilsSuite: -- bytesToString -- copyStream -- memoryStringToMb -- splitCommandString -- string formatting of time durations -- reading offset bytes of a file -- reading offset bytes across multiple files -- deserialize long value -- get iterator size -- findOldFiles -- resolveURI -- nonLocalPaths -SortingSuite: -- sortByKey -- large array -- large array with one split -- large array with many partitions -- sort descending -- sort descending with one split -- sort descending with many partitions -- more partitions than elements -- empty RDD -- partition balancing -- partition balancing for descending sort -TaskContextSuite: -- Calls executeOnCompleteCallbacks after failure -NextIteratorSuite: -- one iteration -- two iterations -- empty iteration -- close is called once for empty iterations -- close is called once for non-empty iterations -ParallelCollectionSplitSuite: -- one element per slice -- one slice -- equal slices -- non-equal slices -- splitting exclusive range -- splitting inclusive range -- empty data -- zero slices -- negative number of slices -- exclusive ranges sliced into ranges -- inclusive ranges sliced into ranges -- identical slice sizes between Range and NumericRange -- identical slice sizes between List and NumericRange -- large ranges don't overflow -- random array tests -- random exclusive range tests -- random inclusive range tests -- exclusive ranges of longs -- inclusive ranges of longs -- exclusive ranges of doubles -- inclusive ranges of doubles -ExecutorURLClassLoaderSuite: -- child first -- parent first -- child first can fall back -- child first can fail -- driver sets context class loader in local mode -ExecutorRunnerTest: -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- command includes appId -EventLoggingListenerSuite: -- Parse names of special files -- Verify special files exist -- Verify special files exist with compression -- Parse event logging info -- Parse event logging info with compression -- Basic event logging -- Basic event logging with compression -- End-to-end event logging -- End-to-end event logging with compression -DriverRunnerTest: -- Process succeeds instantly -- Process failing several times and then succeeding -- Process doesn't restart if not supervised -- Process doesn't restart if killed -- Reset of backoff counter -PairRDDFunctionsSuite: -- aggregateByKey -- groupByKey -- groupByKey with duplicates -- groupByKey with negative key hash codes -- groupByKey with many output partitions -- sampleByKey -- reduceByKey -- reduceByKey with collectAsMap -- reduceByKey with many output partitons -- reduceByKey with partitioner -- countApproxDistinctByKey -- join -- join all-to-all -- leftOuterJoin -- rightOuterJoin -- join with no matches -- join with many output partitions -- groupWith -- groupWith3 -- groupWith4 -- zero-partition RDD -- keys and values -- default partitioner uses partition size -- default partitioner uses largest partitioner -- subtract -- subtract with narrow dependency -- subtractByKey -- subtractByKey with narrow dependency -- foldByKey -- foldByKey with mutable result type -- saveNewAPIHadoopFile should call setConf if format is configurable -- lookup -- lookup with partitioner -- lookup with bad partitioner -PrimitiveVectorSuite: -- primitive value -- non-primitive value -- ideal growth -- ideal size -- resizing -MetricsConfigSuite: -- MetricsConfig with default properties -- MetricsConfig with properties set -- MetricsConfig with subProperties -SparkContextSchedulerCreationSuite: -- bad-master -- local -- local-* -- local-n -- local-*-n-failures -- local-n-failures -- bad-local-n -- bad-local-n-failures -- local-default-parallelism -- simr -- local-cluster -- yarn-cluster -- yarn-standalone -- yarn-client -Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib -- mesos fine-grained -Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib -- mesos coarse-grained -Failed to load native Mesos library from /usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64/server:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/lib/amd64:/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib -- mesos with zookeeper -SamplingUtilsSuite: -- reservoirSampleAndCount -- computeFraction -TimeStampedHashMapSuite: -- HashMap - basic test -- TimeStampedHashMap - basic test -- TimeStampedHashMap - threading safety test -- TimeStampedWeakValueHashMap - basic test -- TimeStampedWeakValueHashMap - threading safety test -- TimeStampedHashMap - clearing by timestamp -- TimeStampedWeakValueHashMap - clearing by timestamp -- TimeStampedWeakValueHashMap - clearing weak references -RandomSamplerSuite: -- BernoulliSamplerWithRange -- BernoulliSamplerWithRangeInverse -- BernoulliSamplerWithRatio -- BernoulliSamplerWithComplement -- BernoulliSamplerSetSeed -- PoissonSampler -ImplicitOrderingSuite: -- basic inference of Orderings -ClosureCleanerSuite: -- closures inside an object -- closures inside a class -- closures inside a class with no default constructor -- closures that don't use fields of the outer class -- nested closures inside an object -- nested closures inside a class -- toplevel return statements in closures are identified at cleaning time -- return statements from named functions nested in closures don't raise exceptions -UnpersistSuite: -- unpersist RDD -TaskSetManagerSuite: -- TaskSet with no preferences -- multiple offers with no preferences -- skip unsatisfiable locality levels -- basic delay scheduling -- delay scheduling with fallback -- delay scheduling with failed hosts -- task result lost -- repeated failures lead to task set abortion -- executors should be blacklisted after task failure, in spite of locality preferences -- new executors get added -- test RACK_LOCAL tasks -- do not emit warning when serialized task is small -- emit warning when serialized task is large -DriverSuite: -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- driver should exit after finishing -CompactBufferSuite: -- empty buffer -- basic inserts -- adding sequences -- adding the same buffer to itself -CacheManagerSuite: -- get uncached rdd -- get cached rdd -- get uncached local rdd -TaskSchedulerImplSuite: -- FIFO Scheduler Test -- Fair Scheduler Test -- Nested Pool Test -- Scheduler does not always schedule tasks on the same workers -- Scheduler correctly accounts for multiple CPUs per task -SparkConfSuite: -- loading from system properties -- initializing without loading defaults -- named set methods -- basic get and set -- creating SparkContext without master and app name -- creating SparkContext without master -- creating SparkContext without app name -- creating SparkContext with both master and app name -- SparkContext property overriding -- nested property names -BlockManagerSuite: -- StorageLevel object caching -- BlockManagerId object caching -- master + 1 manager interaction -- master + 2 managers interaction -- removing block -- removing rdd -- removing broadcast -- reregistration on heart beat -- reregistration on block update -- reregistration doesn't dead lock -Some(org.apache.spark.storage.BlockResult@12a52bbe) -- correct BlockResult returned from get() calls -- in-memory LRU storage -- in-memory LRU storage with serialization -- in-memory LRU for partitions of same RDD -- in-memory LRU for partitions of multiple RDDs -- tachyon storage - + tachyon storage test disabled.  -- on-disk storage -- disk and memory storage -- disk and memory storage with getLocalBytes -- disk and memory storage with serialization -- disk and memory storage with serialization and getLocalBytes -- LRU with mixed storage levels -- in-memory LRU with streams -- LRU with mixed storage levels and streams -- negative byte values in ByteBufferInputStream -- overly large block -- block compression -- block store put failure -- reads of memory-mapped and non memory-mapped files are equivalent -- updated block statuses -- query block statuses -- get matching blocks -- SPARK-1194 regression: fix the same-RDD rule for cache replacement -- reserve/release unroll memory -- safely unroll blocks -- safely unroll blocks through putIterator -- safely unroll blocks through putIterator (disk) -- multiple unrolls by the same thread -PythonRunnerSuite: -- format path -- format paths -BitSetSuite: -- basic set and get -- 100% full bit set -- nextSetBit -- xor len(bitsetX) < len(bitsetY) -- xor len(bitsetX) > len(bitsetY) -- andNot len(bitsetX) < len(bitsetY) -- andNot len(bitsetX) > len(bitsetY) -AsyncRDDActionsSuite: -- countAsync -- collectAsync -- foreachAsync -- foreachPartitionAsync -- takeAsync -- async success handling -- async failure handling -- FutureAction result, infinite wait -- FutureAction result, finite wait -- FutureAction result, timeout -MetricsSystemSuite: -- MetricsSystem with default config -- MetricsSystem with sources add -JobCancellationSuite: -- local mode, FIFO scheduler -- local mode, fair scheduler -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- cluster mode, FIFO scheduler -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- cluster mode, fair scheduler -- do not put partially executed partitions into cache -- job group -- job group with interruption -- two jobs sharing the same stage !!! IGNORED !!! -PartitioningSuite: -- HashPartitioner equality -- RangePartitioner equality -- RangePartitioner getPartition -- RangePartitioner for keys that are not Comparable (but with Ordering) -- RangPartitioner.sketch -- RangePartitioner.determineBounds -- RangePartitioner should run only one job if data is roughly balanced -- RangePartitioner should work well on unbalanced data -- RangePartitioner should return a single partition for empty RDDs -- HashPartitioner not equal to RangePartitioner -- partitioner preservation -- partitioning Java arrays should fail -- zero-length partitions should be correctly handled -SecurityManagerSuite: -- set security with conf -- set security with api -UISuite: -- basic ui visibility !!! IGNORED !!! -- visibility at localhost:4040 !!! IGNORED !!! -- attaching a new tab !!! IGNORED !!! -- jetty selects different port under contention -- jetty binds to port 0 correctly -- verify appUIAddress contains the scheme -- verify appUIAddress contains the port -SortShuffleSuite: -- groupByKey without compression -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- shuffle non-zero block size -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- shuffle serializer -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- zero sized blocks -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- zero sized blocks without kryo -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- shuffle on mutable pairs -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- sorting on mutable pairs -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- cogroup using mutable pairs -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- subtract mutable pairs -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- sort with Java non serializable class - Kryo -Spark assembly has been built with Hive, including Datanucleus jars on classpath -Spark assembly has been built with Hive, including Datanucleus jars on classpath -- sort with Java non serializable class - Java -/bin/sh: line 1: 1895 Killed java -Dbasedir=/shared/strlen/core -Xmx3g -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m org.scalatest.tools.Runner -R '/shared/strlen/core/target/scala-2.10/classes /shared/strlen/core/target/scala-2.10/test-classes' -o -f /shared/strlen/core/target/surefire-reports/shared/strlen/core/target/SparkTestSuite.txt -u /shared/strlen/core/target/surefire-reports/. -[INFO] ------------------------------------------------------------------------ -[INFO] Reactor Summary: -[INFO] -[INFO] Spark Project Parent POM .......................... SUCCESS [2.021s] -[INFO] Spark Project Core ................................ FAILURE [8:09.635s] -[INFO] Spark Project Bagel ............................... SKIPPED -[INFO] Spark Project GraphX .............................. SKIPPED -[INFO] Spark Project ML Library .......................... SKIPPED -[INFO] Spark Project Streaming ........................... SKIPPED -[INFO] Spark Project Tools ............................... SKIPPED -[INFO] Spark Project Catalyst ............................ SKIPPED -[INFO] Spark Project SQL ................................. SKIPPED -[INFO] Spark Project Hive ................................ SKIPPED -[INFO] Spark Project REPL ................................ SKIPPED -[INFO] Spark Project YARN Parent POM ..................... SKIPPED -[INFO] Spark Project YARN Stable API ..................... SKIPPED -[INFO] Spark Project Assembly ............................ SKIPPED -[INFO] Spark Project External Twitter .................... SKIPPED -[INFO] Spark Project External Kafka ...................... SKIPPED -[INFO] Spark Project External Flume Sink ................. SKIPPED -[INFO] Spark Project External Flume ...................... SKIPPED -[INFO] Spark Project External ZeroMQ ..................... SKIPPED -[INFO] Spark Project External MQTT ....................... SKIPPED -[INFO] Spark Project Examples ............................ SKIPPED -[INFO] ------------------------------------------------------------------------ -[INFO] BUILD FAILURE -[INFO] ------------------------------------------------------------------------ -[INFO] Total time: 8:12.473s -[INFO] Finished at: Fri Aug 01 20:03:30 PDT 2014 -[INFO] Final Memory: 30M/697M -[INFO] ------------------------------------------------------------------------ -[ERROR] Failed to execute goal org.scalatest:scalatest-maven-plugin:1.0-RC2:test (test) on project spark-core_2.10: There are test failures -> [Help 1] -[ERROR] -[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. -[ERROR] Re-run Maven using the -X switch to enable full debug logging. -[ERROR] -[ERROR] For more information about the errors and possible solutions, please read the following articles: -[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException -[ERROR] -[ERROR] After correcting the problems, you can resume the build with the command -[ERROR] mvn -rf :spark-core_2.10 From 22eddbce6a201c8f5b5c31859ceb972e60657377 Mon Sep 17 00:00:00 2001 From: Stephen Boesch Date: Sun, 3 Aug 2014 22:48:54 -0700 Subject: [PATCH 22/22] Use Octet/Char_Len instead of Octet/Char_length due to apparent preexisting spark ParserCombinator bug. --- .../org/apache/spark/sql/catalyst/SqlParser.scala | 10 +++++----- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 12 ++++++------ .../scala/org/apache/spark/sql/hive/HiveQl.scala | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 188953b272dac..826e3b76f9bd8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -124,8 +124,8 @@ class SqlParser extends StandardTokenParsers with PackratParsers { protected val SUBSTRING = Keyword("SUBSTRING") protected val LEN = Keyword("LEN") protected val LENGTH = Keyword("LENGTH") - protected val CHAR_LENGTH = Keyword("CHAR_LENGTH") - protected val OCTET_LENGTH = Keyword("OCTET_LENGTH") + protected val CHAR_LEN = Keyword("CHAR_LEN") + protected val OCTET_LEN = Keyword("OCTET_LEN") // Use reflection to find the reserved words defined in this class. protected val reservedWords = @@ -327,11 +327,11 @@ class SqlParser extends StandardTokenParsers with PackratParsers { (SUBSTR | SUBSTRING) ~> "(" ~> expression ~ "," ~ expression ~ "," ~ expression <~ ")" ^^ { case s ~ "," ~ p ~ "," ~ l => Substring(s,p,l) } | - (LEN | LENGTH | CHAR_LENGTH) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | - OCTET_LENGTH ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { + (LEN | LENGTH | CHAR_LEN) ~> "(" ~> expression <~ ")" ^^ { case s => Length(s) } | + OCTET_LEN ~> "(" ~> expression ~ "," ~ expression <~ ")" ^^ { case s ~ "," ~ e => OctetLength(s, e) } | - OCTET_LENGTH ~> "(" ~> expression <~ ")" ^^ { + OCTET_LEN ~> "(" ~> expression <~ ")" ^^ { case s => OctetLength(s, Literal(OctetLengthConstants.DefaultEncoding)) } | ident ~ "(" ~ repsep(expression, ",") <~ ")" ^^ { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 8e834e6e3ce1e..595b11c2a305d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -54,7 +54,7 @@ class SQLQuerySuite extends QueryTest { test("SPARK-2686 Added Parser of SQL LENGTH()") { checkAnswer( - sql("SELECT char_length(key) as keylen from testData where key = 100"), 3) + sql("SELECT char_len(key) as keylen from testData where key = 100"), 3) checkAnswer( sql("SELECT len(key), count(*) as cnt from testData where key <= 100 group by len(key)"), Seq(Seq(1,9),Seq(2,90), Seq(3,1))) @@ -66,15 +66,15 @@ class SQLQuerySuite extends QueryTest { sql("SELECT max(LENGTH(s)) FROM nullableRepeatedData"), 4) } - test("SPARK-2686 Added Parser of SQL OCTET_LENGTH()") { + test("SPARK-2686 Added Parser of SQL OCTET_LEN()") { checkAnswer( - sql("SELECT octet_length(s) from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_len(s) from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT octet_length(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) + sql("SELECT octet_len(s,'UTF-8') from repeatedData"), Seq(Seq(4),Seq(4))) checkAnswer( - sql("SELECT max(octet_length(s,'UTF-8')) from nullStrings"), 3) + sql("SELECT max(octet_len(s,'UTF-8')) from nullStrings"), 3) checkAnswer( - sql("SELECT octet_length('a','ISO-8859-1') + octet_length('abcde','ISO-8859-1') FROM testData limit 1"), 6) + sql("SELECT octet_len('a','ISO-8859-1') + octet_len('abcde','ISO-8859-1') FROM testData limit 1"), 6) } test("index into array") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index a60d3dad8a176..e6769646e4af2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -866,8 +866,8 @@ private[hive] object HiveQl { val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r val SUBSTR = "(?i)SUBSTR(?:ING)?".r - val CHAR_LENGTH = "(?i)CHAR_LENGTH".r - val OCTET_LENGTH = "(?i)OCTET_LENGTH".r + val CHAR_LEN = "(?i)CHAR_LEN".r + val OCTET_LEN = "(?i)OCTET_LEN".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ @@ -997,11 +997,11 @@ private[hive] object HiveQl { case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) - case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => + case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) - case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: Nil) => + case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: Nil) => OctetLength(nodeToExpr(string), Literal(OctetLengthConstants.DefaultEncoding)) - case Token("TOK_FUNCTION", Token(OCTET_LENGTH(), Nil) :: string :: encoding :: Nil) => + case Token("TOK_FUNCTION", Token(OCTET_LEN(), Nil) :: string :: encoding :: Nil) => OctetLength(nodeToExpr(string), nodeToExpr(encoding))