-
Notifications
You must be signed in to change notification settings - Fork 117
Add support for delete to index refresh #142
Changes from all commits
4d002cd
f8f72f2
aed6255
bf98425
fcb4f95
ba55f30
a15b71d
70d9416
0b26209
3cc1e0f
bfa0543
66fa8fb
2a62d1d
9916c25
d808a13
38623a0
96b10bb
6bf5990
25bc63c
d0bda6a
8da34c6
33c2080
7ca9f8c
46aa2a7
ed88a36
c98b30b
5d08fc6
0499b45
7137b89
7be558b
7974b9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| /* | ||
| * Copyright (2020) The Hyperspace Project Authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.microsoft.hyperspace.actions | ||
|
|
||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.types.{DataType, StructType} | ||
|
|
||
| import com.microsoft.hyperspace.HyperspaceException | ||
| import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, REFRESHING} | ||
| import com.microsoft.hyperspace.index._ | ||
|
|
||
| /** | ||
| * Base abstract class containing common code for different types of index refresh actions. | ||
| * | ||
| * @param spark SparkSession | ||
| * @param logManager Index LogManager for index being refreshed | ||
| * @param dataManager Index DataManager for index being refreshed | ||
| */ | ||
| // TODO: This class depends directly on LogEntry. This should be updated such that | ||
| // it works with IndexLogEntry only. (for example, this class can take in | ||
| // derivedDataset specific logic for refreshing). | ||
|
pirz marked this conversation as resolved.
|
||
| private[actions] abstract class RefreshActionBase( | ||
| spark: SparkSession, | ||
| final override protected val logManager: IndexLogManager, | ||
| dataManager: IndexDataManager) | ||
| extends CreateActionBase(dataManager) | ||
| with Action { | ||
| private lazy val previousLogEntry: LogEntry = { | ||
| logManager.getLog(baseId).getOrElse { | ||
| throw HyperspaceException("LogEntry must exist for refresh operation") | ||
| } | ||
| } | ||
|
|
||
| protected lazy val previousIndexLogEntry = previousLogEntry.asInstanceOf[IndexLogEntry] | ||
|
|
||
| // Reconstruct a df from schema | ||
| protected lazy val df = { | ||
|
pirz marked this conversation as resolved.
|
||
| val rels = previousIndexLogEntry.relations | ||
| val dataSchema = DataType.fromJson(rels.head.dataSchemaJson).asInstanceOf[StructType] | ||
| spark.read | ||
| .schema(dataSchema) | ||
| .format(rels.head.fileFormat) | ||
| .options(rels.head.options) | ||
| .load(rels.head.rootPaths: _*) | ||
| } | ||
|
|
||
| protected lazy val indexConfig: IndexConfig = { | ||
| val ddColumns = previousIndexLogEntry.derivedDataset.properties.columns | ||
| IndexConfig(previousIndexLogEntry.name, ddColumns.indexed, ddColumns.included) | ||
| } | ||
|
|
||
| final override def logEntry: LogEntry = getIndexLogEntry(spark, df, indexConfig, indexDataPath) | ||
|
|
||
| final override val transientState: String = REFRESHING | ||
|
|
||
| final override val finalState: String = ACTIVE | ||
|
|
||
| override def validate(): Unit = { | ||
| if (!previousIndexLogEntry.state.equalsIgnoreCase(ACTIVE)) { | ||
| throw HyperspaceException( | ||
| s"Refresh is only supported in $ACTIVE state. " + | ||
| s"Current index state is ${previousIndexLogEntry.state}") | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| /* | ||
| * Copyright (2020) The Hyperspace Project Authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package com.microsoft.hyperspace.actions | ||
|
|
||
| import org.apache.hadoop.fs.Path | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.functions._ | ||
|
|
||
| import com.microsoft.hyperspace.HyperspaceException | ||
| import com.microsoft.hyperspace.index._ | ||
| import com.microsoft.hyperspace.index.DataFrameWriterExtensions.Bucketizer | ||
| import com.microsoft.hyperspace.telemetry.{AppInfo, HyperspaceEvent, RefreshDeleteActionEvent} | ||
|
|
||
|
pirz marked this conversation as resolved.
|
||
| /** | ||
| * Refresh index by removing index entries from any deleted source data file. | ||
| * Note this Refresh Action only fixes an index w.r.t deleted source data files | ||
| * and does not consider new source data files (if any). | ||
| * If some original source data file(s) are removed between previous version of index and now, | ||
| * this Refresh Action updates the index as follows: | ||
| * 1. Deleted source data files are identified; | ||
| * 2. Index records' lineage is leveraged to remove any index entry coming | ||
| * from those deleted source data files. | ||
| * | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add a note that this doesn't handle any "new" source data? (and update PR description as well?)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated both.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just so I understand this right, this is the plan, right?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Can you also link to the issue that will handle this addition of "new" source data? Once we address #149, we can get rid of that line.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, all I meant is some clarification saying something like:
|
||
| * @param spark SparkSession | ||
| * @param logManager Index LogManager for index being refreshed | ||
| * @param dataManager Index DataManager for index being refreshed | ||
| */ | ||
| class RefreshDeleteAction( | ||
|
imback82 marked this conversation as resolved.
|
||
| spark: SparkSession, | ||
| logManager: IndexLogManager, | ||
| dataManager: IndexDataManager) | ||
| extends RefreshActionBase(spark, logManager, dataManager) | ||
| with Logging { | ||
|
|
||
| final override protected def event(appInfo: AppInfo, message: String): HyperspaceEvent = { | ||
| RefreshDeleteActionEvent(appInfo, logEntry.asInstanceOf[IndexLogEntry], message) | ||
| } | ||
|
|
||
| /** | ||
| * Validate index has lineage column and it is in active state for refreshing and | ||
| * there are some deleted source data file(s). | ||
| */ | ||
| final override def validate(): Unit = { | ||
| super.validate() | ||
| if (!previousIndexLogEntry.hasLineageColumn(spark)) { | ||
|
imback82 marked this conversation as resolved.
|
||
| throw HyperspaceException( | ||
| "Index refresh (to handle deleted source data) is " + | ||
| "only supported on an index with lineage.") | ||
| } | ||
|
|
||
| if (deletedFiles.isEmpty) { | ||
| throw HyperspaceException("Refresh aborted as no deleted source data file found.") | ||
|
imback82 marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * For an index with lineage, find all the source data files which have been deleted, | ||
| * and use index records' lineage to mark and remove index entries which belong to | ||
| * deleted source data files as those entries are no longer valid. | ||
| */ | ||
| final override def op(): Unit = { | ||
|
rapoth marked this conversation as resolved.
|
||
| logInfo( | ||
| "Refresh index is updating index by removing index entries " + | ||
| s"corresponding to ${deletedFiles.length} deleted source data files.") | ||
|
|
||
| val refreshDF = | ||
| spark.read | ||
| .parquet(previousIndexLogEntry.content.files.map(_.toString): _*) | ||
| .filter(!col(s"${IndexConstants.DATA_FILE_NAME_COLUMN}").isin(deletedFiles: _*)) | ||
|
|
||
| refreshDF.write.saveWithBuckets( | ||
| refreshDF, | ||
| indexDataPath.toString, | ||
| previousIndexLogEntry.numBuckets, | ||
| indexConfig.indexedColumns) | ||
| } | ||
|
|
||
| /** | ||
| * Compare list of source data files from previous IndexLogEntry to list | ||
| * of current source data files, validate fileInfo for existing files and | ||
| * identify deleted source data files. | ||
| */ | ||
| private lazy val deletedFiles: Seq[String] = { | ||
| val rels = previousIndexLogEntry.relations | ||
| val originalFiles = rels.head.data.properties.content.fileInfos | ||
| val currentFiles = rels.head.rootPaths | ||
| .flatMap { p => | ||
| Content | ||
| .fromDirectory(path = new Path(p), throwIfNotExists = true) | ||
| .fileInfos | ||
| } | ||
| .map(f => f.name -> f) | ||
| .toMap | ||
|
|
||
| var delFiles = Seq[String]() | ||
| originalFiles.foreach { f => | ||
| currentFiles.get(f.name) match { | ||
| case Some(v) => | ||
| if (!f.equals(v)) { | ||
| throw HyperspaceException( | ||
| "Index refresh (to handle deleted source data) aborted. " + | ||
| s"Existing source data file info is changed (file: ${f.name}).") | ||
| } | ||
| case None => delFiles :+= f.name | ||
| } | ||
| } | ||
|
|
||
| delFiles | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.