microsoft · imback82 · Sep 18, 2020 · Aug 26, 2020 · Aug 26, 2020 · Aug 31, 2020
diff --git a/src/main/scala/com/microsoft/hyperspace/actions/CreateActionBase.scala b/src/main/scala/com/microsoft/hyperspace/actions/CreateActionBase.scala
@@ -19,7 +19,8 @@ package com.microsoft.hyperspace.actions
 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitioningAwareFileIndex}
-import org.apache.spark.sql.functions.input_file_name
+import org.apache.spark.sql.expressions.UserDefinedFunction
+import org.apache.spark.sql.functions.{input_file_name, udf}
 import org.apache.spark.sql.sources.DataSourceRegister
 
 import com.microsoft.hyperspace.HyperspaceException
@@ -181,8 +182,24 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
       val missingPartitionColumns = getPartitionColumns(df).filter(
         ResolverUtils.resolve(spark, _, columnsFromIndexConfig).isEmpty)
       val allIndexColumns = columnsFromIndexConfig ++ missingPartitionColumns
+
+      // File path value in DATA_FILE_NAME_COLUMN column is stored as String. We normalize
+      // path values in this column by removing extra preceding `/` characters in the path
+      // and store it the same way paths are stored in Content in an IndexLogEntry instance.
+      // Normalizing path values in DATA_FILE_NAME_COLUMN column keeps path representation
+      // unified in Hyperspace (between index lineage and index metadata) and helps performance
+      // by avoiding the need to fix paths (i.e. removing extra `/` characters) each time
+      // value of DATA_FILE_NAME_COLUMN column is read.
+      // Here is an example of normalization:
+      // - Original raw path (output of input_file_name() udf, before normalization):
+      //    + file:///C:/myGit/hyperspace-1/src/test/part-00003.snappy.parquet
+      // - Normalized path to be stored in DATA_FILE_NAME_COLUMN column:
+      //    + file:/C:/myGit/hyperspace-1/src/test/part-00003.snappy.parquet
+      val absolutePath: UserDefinedFunction = udf(
+        (filePath: String) => PathUtils.makeAbsolute(filePath).toString)
+
       df.select(allIndexColumns.head, allIndexColumns.tail: _*)
-        .withColumn(IndexConstants.DATA_FILE_NAME_COLUMN, input_file_name())
+        .withColumn(IndexConstants.DATA_FILE_NAME_COLUMN, absolutePath(input_file_name()))
     } else {
       df.select(columnsFromIndexConfig.head, columnsFromIndexConfig.tail: _*)
     }

diff --git a/src/main/scala/com/microsoft/hyperspace/actions/RefreshAction.scala b/src/main/scala/com/microsoft/hyperspace/actions/RefreshAction.scala
@@ -17,61 +17,24 @@
 package com.microsoft.hyperspace.actions
 
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.types.{DataType, StructType}
 
-import com.microsoft.hyperspace.HyperspaceException
-import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, REFRESHING}
 import com.microsoft.hyperspace.index._
 import com.microsoft.hyperspace.telemetry.{AppInfo, HyperspaceEvent, RefreshActionEvent}
 
-// TODO: This class depends directly on LogEntry. This should be updated such that
-//   it works with IndexLogEntry only. (for example, this class can take in
-//   derivedDataset specific logic for refreshing).
+/**
+ * The Index refresh action is used to perform a full rebuild of the index.
+ * Consequently, it ends up creating a new version of the index and involves
+ * a full scan of the underlying source data.
+ *
+ * @param spark SparkSession.
+ * @param logManager Index LogManager for index being refreshed.
+ * @param dataManager Index DataManager for index being refreshed.
+ */
 class RefreshAction(
     spark: SparkSession,
-    final override protected val logManager: IndexLogManager,
+    logManager: IndexLogManager,
     dataManager: IndexDataManager)
-    extends CreateActionBase(dataManager)
-    with Action {
-  private lazy val previousLogEntry: LogEntry = {
-    logManager.getLog(baseId).getOrElse {
-      throw HyperspaceException("LogEntry must exist for refresh operation")
-    }
-  }
-
-  private lazy val previousIndexLogEntry = previousLogEntry.asInstanceOf[IndexLogEntry]
-
-  // Reconstruct a df from schema
-  private lazy val df = {
-    val rels = previousIndexLogEntry.relations
-    // Currently we only support to create an index on a LogicalRelation.
-    assert(rels.size == 1)
-    val dataSchema = DataType.fromJson(rels.head.dataSchemaJson).asInstanceOf[StructType]
-    spark.read
-      .schema(dataSchema)
-      .format(rels.head.fileFormat)
-      .options(rels.head.options)
-      .load(rels.head.rootPaths: _*)
-  }
-
-  private lazy val indexConfig: IndexConfig = {
-    val ddColumns = previousIndexLogEntry.derivedDataset.properties.columns
-    IndexConfig(previousIndexLogEntry.name, ddColumns.indexed, ddColumns.included)
-  }
-
-  final override def logEntry: LogEntry = getIndexLogEntry(spark, df, indexConfig, indexDataPath)
-
-  final override val transientState: String = REFRESHING
-
-  final override val finalState: String = ACTIVE
-
-  final override def validate(): Unit = {
-    if (!previousIndexLogEntry.state.equalsIgnoreCase(ACTIVE)) {
-      throw HyperspaceException(
-        s"Refresh is only supported in $ACTIVE state. " +
-          s"Current index state is ${previousIndexLogEntry.state}")
-    }
-  }
+    extends RefreshActionBase(spark, logManager, dataManager) {
 
   final override def op(): Unit = {
     // TODO: The current implementation picks the number of buckets from session config.

diff --git a/src/main/scala/com/microsoft/hyperspace/actions/RefreshActionBase.scala b/src/main/scala/com/microsoft/hyperspace/actions/RefreshActionBase.scala
@@ -0,0 +1,79 @@
+/*
+ * Copyright (2020) The Hyperspace Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.microsoft.hyperspace.actions
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.types.{DataType, StructType}
+
+import com.microsoft.hyperspace.HyperspaceException
+import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, REFRESHING}
+import com.microsoft.hyperspace.index._
+
+/**
+ * Base abstract class containing common code for different types of index refresh actions.
+ *
+ * @param spark SparkSession
+ * @param logManager Index LogManager for index being refreshed
+ * @param dataManager Index DataManager for index being refreshed
+ */
+// TODO: This class depends directly on LogEntry. This should be updated such that
+//   it works with IndexLogEntry only. (for example, this class can take in
+//   derivedDataset specific logic for refreshing).
+private[actions] abstract class RefreshActionBase(
+    spark: SparkSession,
+    final override protected val logManager: IndexLogManager,
+    dataManager: IndexDataManager)
+    extends CreateActionBase(dataManager)
+    with Action {
+  private lazy val previousLogEntry: LogEntry = {
+    logManager.getLog(baseId).getOrElse {
+      throw HyperspaceException("LogEntry must exist for refresh operation")
+    }
+  }
+
+  protected lazy val previousIndexLogEntry = previousLogEntry.asInstanceOf[IndexLogEntry]
+
+  // Reconstruct a df from schema
+  protected lazy val df = {
+    val rels = previousIndexLogEntry.relations
+    val dataSchema = DataType.fromJson(rels.head.dataSchemaJson).asInstanceOf[StructType]
+    spark.read
+      .schema(dataSchema)
+      .format(rels.head.fileFormat)
+      .options(rels.head.options)
+      .load(rels.head.rootPaths: _*)
+  }
+
+  protected lazy val indexConfig: IndexConfig = {
+    val ddColumns = previousIndexLogEntry.derivedDataset.properties.columns
+    IndexConfig(previousIndexLogEntry.name, ddColumns.indexed, ddColumns.included)
+  }
+
+  final override def logEntry: LogEntry = getIndexLogEntry(spark, df, indexConfig, indexDataPath)
+
+  final override val transientState: String = REFRESHING
+
+  final override val finalState: String = ACTIVE
+
+  override def validate(): Unit = {
+    if (!previousIndexLogEntry.state.equalsIgnoreCase(ACTIVE)) {
+      throw HyperspaceException(
+        s"Refresh is only supported in $ACTIVE state. " +
+          s"Current index state is ${previousIndexLogEntry.state}")
+    }
+  }
+}
diff --git a/src/main/scala/com/microsoft/hyperspace/actions/RefreshDeleteAction.scala b/src/main/scala/com/microsoft/hyperspace/actions/RefreshDeleteAction.scala
@@ -0,0 +1,125 @@
+/*
+ * Copyright (2020) The Hyperspace Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.microsoft.hyperspace.actions
+
+import org.apache.hadoop.fs.Path
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions._
+
+import com.microsoft.hyperspace.HyperspaceException
+import com.microsoft.hyperspace.index._
+import com.microsoft.hyperspace.index.DataFrameWriterExtensions.Bucketizer
+import com.microsoft.hyperspace.telemetry.{AppInfo, HyperspaceEvent, RefreshDeleteActionEvent}
+
+/**
+ * Refresh index by removing index entries from any deleted source data file.
+ * Note this Refresh Action only fixes an index w.r.t deleted source data files
+ * and does not consider new source data files (if any).
+ * If some original source data file(s) are removed between previous version of index and now,
+ * this Refresh Action updates the index as follows:
+ * 1. Deleted source data files are identified;
+ * 2. Index records' lineage is leveraged to remove any index entry coming
+ *    from those deleted source data files.
+ *
+ * @param spark SparkSession
+ * @param logManager Index LogManager for index being refreshed
+ * @param dataManager Index DataManager for index being refreshed
+ */
+class RefreshDeleteAction(
+    spark: SparkSession,
+    logManager: IndexLogManager,
+    dataManager: IndexDataManager)
+    extends RefreshActionBase(spark, logManager, dataManager)
+    with Logging {
+
+  final override protected def event(appInfo: AppInfo, message: String): HyperspaceEvent = {
+    RefreshDeleteActionEvent(appInfo, logEntry.asInstanceOf[IndexLogEntry], message)
+  }
+
+  /**
+   * Validate index has lineage column and it is in active state for refreshing and
+   * there are some deleted source data file(s).
+   */
+  final override def validate(): Unit = {
+    super.validate()
+    if (!previousIndexLogEntry.hasLineageColumn(spark)) {
+      throw HyperspaceException(
+        "Index refresh (to handle deleted source data) is " +
+          "only supported on an index with lineage.")
+    }
+
+    if (deletedFiles.isEmpty) {
+      throw HyperspaceException("Refresh aborted as no deleted source data file found.")
+    }
+  }
+
+  /**
+   * For an index with lineage, find all the source data files which have been deleted,
+   * and use index records' lineage to mark and remove index entries which belong to
+   * deleted source data files as those entries are no longer valid.
+   */
+  final override def op(): Unit = {
+    logInfo(
+      "Refresh index is updating index by removing index entries " +
+        s"corresponding to ${deletedFiles.length} deleted source data files.")
+
+    val refreshDF =
+      spark.read
+        .parquet(previousIndexLogEntry.content.files.map(_.toString): _*)
+        .filter(!col(s"${IndexConstants.DATA_FILE_NAME_COLUMN}").isin(deletedFiles: _*))
+
+    refreshDF.write.saveWithBuckets(
+      refreshDF,
+      indexDataPath.toString,
+      previousIndexLogEntry.numBuckets,
+      indexConfig.indexedColumns)
+  }
+
+  /**
+   * Compare list of source data files from previous IndexLogEntry to list
+   * of current source data files, validate fileInfo for existing files and
+   * identify deleted source data files.
+   */
+  private lazy val deletedFiles: Seq[String] = {
+    val rels = previousIndexLogEntry.relations
+    val originalFiles = rels.head.data.properties.content.fileInfos
+    val currentFiles = rels.head.rootPaths
+      .flatMap { p =>
+        Content
+          .fromDirectory(path = new Path(p), throwIfNotExists = true)
+          .fileInfos
+      }
+      .map(f => f.name -> f)
+      .toMap
+
+    var delFiles = Seq[String]()
+    originalFiles.foreach { f =>
+      currentFiles.get(f.name) match {
+        case Some(v) =>
+          if (!f.equals(v)) {
+            throw HyperspaceException(
+              "Index refresh (to handle deleted source data) aborted. " +
+                s"Existing source data file info is changed (file: ${f.name}).")
+          }
+        case None => delFiles :+= f.name
+      }
+    }
+
+    delFiles
+  }
+}
diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexCollectionManager.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexCollectionManager.scala
@@ -22,6 +22,7 @@ import org.apache.spark.sql.internal.SQLConf
 
 import com.microsoft.hyperspace.HyperspaceException
 import com.microsoft.hyperspace.actions._
+import com.microsoft.hyperspace.util.HyperspaceConf
 
 class IndexCollectionManager(
     spark: SparkSession,
@@ -66,7 +67,11 @@ class IndexCollectionManager(
     withLogManager(indexName) { logManager =>
       val indexPath = PathResolver(spark.sessionState.conf).getIndexPath(indexName)
       val dataManager = indexDataManagerFactory.create(indexPath)
-      new RefreshAction(spark, logManager, dataManager).run()
+      if (HyperspaceConf.refreshDeleteEnabled(spark)) {
+        new RefreshDeleteAction(spark, logManager, dataManager).run()
+      } else {
+        new RefreshAction(spark, logManager, dataManager).run()
+      }
     }
   }
 

diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexConstants.scala
@@ -54,4 +54,7 @@ object IndexConstants {
   private[hyperspace] val DATA_FILE_NAME_COLUMN = "_data_file_name"
   val INDEX_LINEAGE_ENABLED = "spark.hyperspace.index.lineage.enabled"
   val INDEX_LINEAGE_ENABLED_DEFAULT = "false"
+
+  val REFRESH_DELETE_ENABLED = "spark.hyperspace.index.refresh.delete.enabled"
+  val REFRESH_DELETE_ENABLED_DEFAULT = "false"
 }
diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala
@@ -24,11 +24,12 @@ import scala.collection.mutable.{HashMap, ListBuffer}
 import com.fasterxml.jackson.annotation.JsonIgnore
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.types.{DataType, StructType}
 
 import com.microsoft.hyperspace.HyperspaceException
 import com.microsoft.hyperspace.actions.Constants
-import com.microsoft.hyperspace.util.PathUtils
+import com.microsoft.hyperspace.util.{PathUtils, ResolverUtils}
 
 // IndexLogEntry-specific fingerprint to be temporarily used where fingerprint is not defined.
 case class NoOpFingerprint() {
@@ -402,6 +403,12 @@ case class IndexLogEntry(
     sourcePlanSignatures.head
   }
 
+  def hasLineageColumn(spark: SparkSession): Boolean = {
+    ResolverUtils
+      .resolve(spark, IndexConstants.DATA_FILE_NAME_COLUMN, schema.fieldNames)
+      .isDefined
+  }
+
   override def hashCode(): Int = {
     config.hashCode + signature.hashCode + numBuckets.hashCode + content.hashCode
   }

diff --git a/src/main/scala/com/microsoft/hyperspace/telemetry/HyperspaceEvent.scala b/src/main/scala/com/microsoft/hyperspace/telemetry/HyperspaceEvent.scala
@@ -105,9 +105,21 @@ case class RefreshActionEvent(appInfo: AppInfo, index: IndexLogEntry, message: S
 case class CancelActionEvent(appInfo: AppInfo, index: IndexLogEntry, message: String)
     extends HyperspaceIndexCRUDEvent
 
+/**
+ * Index Refresh Event for deleted source files. Emitted when refresh is called on an index
+ * with config flag set to remove index entries for deleted source data files.
+ *
+ * @param appInfo AppInfo for spark application.
+ * @param index Related index.
+ * @param message Message about event.
+ */
+case class RefreshDeleteActionEvent(appInfo: AppInfo, index: IndexLogEntry, message: String)
+    extends HyperspaceIndexCRUDEvent
+
 /**
  * Index usage event. This event is emitted when an index is picked instead of original data
  * source by one of the hyperspace rules.
+ *
  * @param appInfo AppInfo for spark application.
  * @param indexes List of selected indexes for this plan.
  * @param planBeforeRule Original plan before application of indexes.