Skip to content

Commit

Permalink
Metadata changes for sensitive feature information (#457)
Browse files Browse the repository at this point in the history
* Added a class for storing sensitive feature information

* Updated ModelInsights to read out this new class when it is attached to `OpVectorMetadata` objects into FeatureInsights

Co-authored-by: Kevin Moore <jauntbox@gmail.com>
  • Loading branch information
MWYang and Jauntbox authored Jan 29, 2020
1 parent a51212a commit 8c0f67b
Show file tree
Hide file tree
Showing 7 changed files with 548 additions and 45 deletions.
42 changes: 38 additions & 4 deletions core/src/main/scala/com/salesforce/op/ModelInsights.scala
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ case class Discrete(domain: Seq[String], prob: Seq[Double]) extends LabelInfo
* @param metrics sequence containing metrics computed in RawFeatureFilter
* @param distributions distribution information for the raw feature (if calculated in RawFeatureFilter)
* @param exclusionReasons exclusion reasons for the raw feature (if calculated in RawFeatureFilter)
*
* @param sensitiveInformation derived information about sensitive field checks (if performed)
*/
case class FeatureInsights
(
Expand All @@ -342,7 +342,8 @@ case class FeatureInsights
derivedFeatures: Seq[Insights],
metrics: Seq[RawFeatureFilterMetrics] = Seq.empty,
distributions: Seq[FeatureDistribution] = Seq.empty,
exclusionReasons: Seq[ExclusionReasons] = Seq.empty
exclusionReasons: Seq[ExclusionReasons] = Seq.empty,
sensitiveInformation: Seq[SensitiveFeatureInformation] = Seq.empty
)

/**
Expand Down Expand Up @@ -697,8 +698,41 @@ case object ModelInsights {
val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
FeatureInsights(featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons)
val sensitiveFeatureInformation = vectorInfo.flatMap(_.sensitive.get(fname)) match {
case Some(info) => info
case _ => Seq.empty
}
FeatureInsights(
featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
sensitiveInformation = sensitiveFeatureInformation
)
}.toSeq ++ {
/*
Add FeatureInsights for removed sensitive fields that do not have a column in OpVectorMetadata.
With current TMOG settings, this will not happen unless null tracking is turned off since
null indicators are created for all text features, even ignored ones.
*/
vectorInfo match {
case Some(v) =>
// Find features where `actionTaken` is true for every one of their sensitive feature information entries
v.sensitive.collect {
case (fname, sensitiveFeatureInformation)
if sensitiveFeatureInformation.forall(_.actionTaken) =>
val ftype = allFeatures.find(_.name == fname)
.map(_.typeName)
.getOrElse("")
val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
FeatureInsights(
featureName = fname, featureType = ftype, derivedFeatures = Seq.empty,
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
sensitiveInformation = sensitiveFeatureInformation
)
}
case None => Seq.empty[FeatureInsights]
}
}.toSeq
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ trait VectorizerDefaults extends OpPipelineStageBase {
val cols =
if (withNullTracking) tf.flatMap { f => Seq(f.toColumnMetaData(), f.toColumnMetaData(isNull = true)) }
else tf.map { f => f.toColumnMetaData() }
OpVectorMetadata(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
OpVectorMetadata.apply(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
}

/**
Expand Down Expand Up @@ -697,6 +697,6 @@ trait MapStringPivotHelper extends SaveOthersParams {
): OpVectorMetadata = {
val otherValueString = $(unseenName)
val cols = makeVectorColumnMetadata(topValues, inputFeatures, otherValueString, trackNulls)
OpVectorMetadata(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
OpVectorMetadata.apply(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
}
}
78 changes: 75 additions & 3 deletions core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
indicatorValue = Option(name)
)
},
Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap
Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
Map(
"f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None))
)
)

it should "correctly extract the LabelSummary from the label and sanity checker info" in {
Expand Down Expand Up @@ -623,6 +626,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
f0In.featureName shouldBe "f0"
f0In.featureType shouldBe classOf[PickList].getName
f0In.derivedFeatures.size shouldBe 2
f0In.sensitiveInformation match {
case Seq(SensitiveNameInformation(
probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
)) =>
actionTaken shouldBe false
probName shouldBe 0.0
genderDetectResults shouldBe Seq.empty[String]
probMale shouldBe 0.0
probFemale shouldBe 0.0
probOther shouldBe 1.0
case _ => fail("SensitiveFeatureInformation was not found.")
}

val f0InDer2 = f0In.derivedFeatures.head
f0InDer2.derivedFeatureName shouldBe "f0_f0_f2_1"
Expand Down Expand Up @@ -690,6 +705,63 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
insights.features.foreach(f => f.distributions shouldBe empty)
}

it should
"""include sensitive feature information
|even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in {
// Copy metadata from above but add new feature that was removed in vectorizing to sensitive info.
// f_notInMeta gets no vector column and no history entry below; it appears only in the sensitive info map.
val f_notInMeta = Feature[Text]("f_notInMeta", isResponse = false, null, Seq(), "test")
val newFeatureName = "fv"
// Columns: one for f1 (Real) followed by two indicator columns ("f2", "f3") whose parent is f0 (PickList)
val newColumnMeta = OpVectorColumnMetadata(
parentFeatureName = Seq("f1"),
parentFeatureType = Seq(classOf[Real].getName),
grouping = None,
indicatorValue = None
) +: Array("f2", "f3").map { name =>
OpVectorColumnMetadata(
parentFeatureName = Seq("f0"),
parentFeatureType = Seq(classOf[PickList].getName),
grouping = Option("f0"),
indicatorValue = Option(name)
)
}
val newFeatureHistory = Seq("f1", "f0").map(
name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())
).toMap
// f0 does not set actionTaken (relies on its default); f_notInMeta is explicitly flagged with actionTaken = true
val newSensitiveInfo = Map(
"f0" -> Seq(SensitiveNameInformation(
0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None
)),
"f_notInMeta" -> Seq(SensitiveNameInformation(
1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, actionTaken = true
))
)
val newMeta = OpVectorMetadata(newFeatureName, newColumnMeta, newFeatureHistory, newSensitiveInfo)

val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary))

val featureInsights = ModelInsights.getFeatureInsights(
Option(newMeta), Option(summary), None, Array(f1, f0, f_notInMeta), Array.empty, Map.empty[String, Set[String]],
RawFeatureFilterResults(), labelSum
)
// Expect insights for f1, f0 and f_notInMeta; presumably the last is contributed by the
// sensitive info map alone, since no vector column references it - confirm against getFeatureInsights
featureInsights.size shouldBe 3
val f_notInMeta_butInInsights = featureInsights.find(_.featureName == "f_notInMeta").get
f_notInMeta_butInInsights.featureName shouldBe "f_notInMeta"
f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName
// No column in newColumnMeta has f_notInMeta as a parent, so no derived features are expected
f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0
f_notInMeta_butInInsights.sensitiveInformation match {
// Exactly one SensitiveNameInformation entry is expected; name and map key are not checked
case Seq(SensitiveNameInformation(
probName, genderDetectResults, probMale, probFemale, probOther, _, _, actionTaken
)) =>
actionTaken shouldBe true
probName shouldBe 1.0
// NOTE(review): the value was built from Seq.empty[GenderDetectionResults]; comparing against
// Seq.empty[String] only verifies emptiness - consider using the matching element type
genderDetectResults shouldBe Seq.empty[String]
probMale shouldBe 0.0
probFemale shouldBe 0.0
probOther shouldBe 1.0
case _ => fail("SensitiveFeatureInformation was not found.")
}
}

it should "return model insights for xgboost classification" in {
noException should be thrownBy xgbWorkflowModel.modelInsights(xgbClassifierPred)
val insights = xgbWorkflowModel.modelInsights(xgbClassifierPred)
Expand Down Expand Up @@ -794,8 +866,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
}

cardinality.foreach { case (featureName, value) =>
val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

package com.salesforce.op.utils.spark

import com.salesforce.op.FeatureHistory
import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation}
import com.salesforce.op.features.types.{FeatureType, _}
import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute}
import org.apache.spark.ml.linalg.SQLDataTypes._
Expand All @@ -43,14 +43,17 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
*
* @param name name of the feature vector
* @param col information about each element in the vector
* @param history history of parent features used to create the vector map is from
* @param history history of parent features used to create the vector; map is from
* OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory
* @param sensitive parent features that were detected as sensitive in the creation of the vector;
* map is from OpVectorColumnMetadata.parentFeatureName (String) to SensitiveFeatureInformation
*/
class OpVectorMetadata private
(
val name: String,
col: Array[OpVectorColumnMetadata],
val history: Map[String, FeatureHistory] // TODO fix map -> causes problems when multiple vectorizers used on feature
val history: Map[String, FeatureHistory], // TODO fix map -> causes problems when multiple vectorizers used on feature
val sensitive: Map[String, Seq[SensitiveFeatureInformation]] = Map.empty[String, Seq[SensitiveFeatureInformation]]
) {

/**
Expand Down Expand Up @@ -92,6 +95,7 @@ class OpVectorMetadata private
val meta = new MetadataBuilder()
.putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray)
.putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history))
.putMetadata(OpVectorMetadata.SensitiveKey, SensitiveFeatureInformation.toMetadata(sensitive))
.build()
val attributes = columns.map {
case c if (c.indicatorValue.isDefined || binaryTypes.exists(c.parentFeatureType.contains)) &&
Expand Down Expand Up @@ -161,15 +165,18 @@ class OpVectorMetadata private
override def equals(obj: Any): Boolean =
obj match {
case o: OpVectorMetadata
if o.name == name && o.columns.toSeq == columns.toSeq && history == o.history => true
if o.name == name &&
o.columns.toSeq == columns.toSeq &&
history == o.history &&
sensitive == o.sensitive => true
case _ => false
}

// have to override to support overridden .equals
override def hashCode(): Int = 37 * columns.toSeq.hashCode()

override def toString: String =
s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history)"
s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history,$sensitive)"

}

Expand All @@ -179,6 +186,7 @@ object OpVectorMetadata {

val ColumnsKey = "vector_columns"
val HistoryKey = "vector_history"
val SensitiveKey = "vector_detected_sensitive"

/**
* Construct an [[OpVectorMetadata]] from a [[StructField]], assuming that [[ColumnsKey]] is present and conforms
Expand All @@ -197,9 +205,14 @@ object OpVectorMetadata {
if (wrapped.underlyingMap(HistoryKey).asInstanceOf[Metadata].isEmpty) Map.empty[String, FeatureHistory]
else FeatureHistory.fromMetadataMap(field.metadata.getMetadata(HistoryKey))

new OpVectorMetadata(field.name, columns, history)
}
val sensitive =
if (wrapped.underlyingMap(SensitiveKey).asInstanceOf[Metadata].isEmpty) {
Map.empty[String, Seq[SensitiveFeatureInformation]]
}
else SensitiveFeatureInformation.fromMetadataMap(field.metadata.getMetadata(SensitiveKey))

new OpVectorMetadata(field.name, columns, history, sensitive)
}

/**
* Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]]
Expand All @@ -214,9 +227,24 @@ object OpVectorMetadata {
name: String,
columns: Array[OpVectorColumnMetadata],
history: Map[String, FeatureHistory]
): OpVectorMetadata = {
new OpVectorMetadata(name, columns, history)
}
): OpVectorMetadata = new OpVectorMetadata(name, columns, history)

/**
* Construct an [[OpVectorMetadata]] from a string representing its name, an array of [[OpVectorColumnMetadata]]
* representing its columns, the history of its parent features, and the sensitive feature information
* detected for its parent features.
*
* @param name The name of the column the metadata represents
* @param columns The columns within the vectors
* @param history The history of the parent features, keyed by parent feature name
* @param sensitive Information about parent features that were detected as sensitive, keyed by parent feature name
* @return The constructed vector metadata
*/
def apply(
name: String,
columns: Array[OpVectorColumnMetadata],
history: Map[String, FeatureHistory],
sensitive: Map[String, Seq[SensitiveFeatureInformation]]
): OpVectorMetadata = new OpVectorMetadata(name, columns, history, sensitive)

/**
* Construct an [[OpVectorMetadata]] from its name and a [[Metadata]], assuming that [[ColumnsKey]] and
Expand All @@ -242,7 +270,8 @@ object OpVectorMetadata {
def flatten(outputName: String, vectors: Seq[OpVectorMetadata]): OpVectorMetadata = {
val allColumns = vectors.flatMap(_.columns).toArray
val allHist = vectors.flatMap(_.history).toMap
new OpVectorMetadata(outputName, allColumns, allHist)
val allSensitive = vectors.flatMap(_.sensitive).toMap
new OpVectorMetadata(outputName, allColumns, allHist, allSensitive)
}

}
Loading

0 comments on commit 8c0f67b

Please sign in to comment.