Skip to content

Commit

Permalink
Enable Html stripping (#478)
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelweilsalesforce committed Jun 10, 2020
1 parent eba38a0 commit e48831a
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ trait RichMapFeature {
* options are from the full entry or from the tokens
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
* @param others additional text features
* @return result feature of type Vector
*/
Expand Down Expand Up @@ -304,6 +305,7 @@ trait RichMapFeature {
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
stripHtml: Boolean = TextTokenizer.StripHtml,
others: Array[FeatureLike[TextMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
Expand All @@ -318,6 +320,7 @@ trait RichMapFeature {
.setAutoDetectThreshold(autoDetectThreshold)
.setDefaultLanguage(defaultLanguage)
.setMinTokenLength(minTokenLength)
.setStripHtml(stripHtml)
.setToLowercase(toLowercase)
.setTopK(topK)
.setMinSupport(minSupport)
Expand Down Expand Up @@ -426,10 +429,9 @@ trait RichMapFeature {
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
* failed to make a good enough prediction.
* @param hashAlgorithm hash algorithm to use
* @param tokenizeForLengths If true, then the length counts will be lengths of the tokens in the entries.
* If false, then the length counts will be the lengths of the entire entries
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
* be hashed instead of ignored
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
* @param others additional text features
* @return result feature of type Vector
*/
Expand Down Expand Up @@ -457,6 +459,7 @@ trait RichMapFeature {
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
stripHtml: Boolean = TextTokenizer.StripHtml,
others: Array[FeatureLike[TextAreaMap]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
Expand All @@ -471,6 +474,7 @@ trait RichMapFeature {
.setAutoDetectThreshold(autoDetectThreshold)
.setDefaultLanguage(defaultLanguage)
.setMinTokenLength(minTokenLength)
.setStripHtml(stripHtml)
.setToLowercase(toLowercase)
.setTopK(topK)
.setMinSupport(minSupport)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ trait RichTextFeature {
* confidence greater than the threshold then defaultLanguage is used.
* @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features
* @param minTokenLength minimum token length, >= 1.
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
* @param trackNulls indicates whether or not to track null values in a separate column.
* Since features may be combined into a shared hash space here, the null value
* should be tracked separately
Expand All @@ -137,6 +138,7 @@ trait RichTextFeature {
autoDetectLanguage: Boolean,
minTokenLength: Int,
toLowercase: Boolean,
stripHtml: Boolean = TextTokenizer.StripHtml,
trackNulls: Boolean = TransmogrifierDefaults.TrackNulls,
trackTextLen: Boolean = TransmogrifierDefaults.TrackTextLen,
hashWithIndex: Boolean = TransmogrifierDefaults.HashWithIndex,
Expand All @@ -153,7 +155,7 @@ trait RichTextFeature {
// scalastyle:on parameter.number
val tokenized = (f +: others).map(_.tokenize(
languageDetector = languageDetector,
analyzer = analyzer,
analyzer = if (stripHtml) TextTokenizer.AnalyzerHtmlStrip else analyzer,
autoDetectLanguage = autoDetectLanguage,
autoDetectThreshold = autoDetectThreshold,
defaultLanguage = defaultLanguage,
Expand Down Expand Up @@ -241,6 +243,7 @@ trait RichTextFeature {
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
stripHtml: Boolean = TextTokenizer.StripHtml,
others: Array[FeatureLike[T]] = Array.empty
): FeatureLike[OPVector] = {
// scalastyle:on parameter.number
Expand All @@ -254,6 +257,7 @@ trait RichTextFeature {
.setAutoDetectThreshold(autoDetectThreshold)
.setDefaultLanguage(defaultLanguage)
.setMinTokenLength(minTokenLength)
.setStripHtml(stripHtml)
.setToLowercase(toLowercase)
.setTopK(topK)
.setMinSupport(minSupport)
Expand Down Expand Up @@ -375,7 +379,7 @@ trait RichTextFeature {
minTokenLength: Int = TextTokenizer.MinTokenLength,
toLowercase: Boolean = TextTokenizer.ToLowercase
): FeatureLike[TextList] = {

// HTML stripping won't work here since the analyzer is a LuceneRegexTextAnalyzer
tokenize(
languageDetector = TextTokenizer.LanguageDetector,
analyzer = new LuceneRegexTextAnalyzer(pattern, group),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
.setMinTokenLength(getMinTokenLength)
.setToLowercase(getToLowercase)
.setTrackTextLen($(trackTextLen))
.setStripHtml(getStripHtml)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
.setMinTokenLength(getMinTokenLength)
.setToLowercase(getToLowercase)
.setTrackTextLen($(trackTextLen))
.setStripHtml(getStripHtml)
}

private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,24 @@ trait TextTokenizerParams extends LanguageDetectionParams with TextMatchingParam
def setMinTokenLength(value: Int): this.type = set(minTokenLength, value)
def getMinTokenLength: Int = $(minTokenLength)

/**
 * Boolean param named "stripHtml" that toggles HTML stripping
 */
final val stripHtml: BooleanParam = new BooleanParam(
  this,
  "stripHtml",
  "enable html stripping"
)

/**
 * Set the [[stripHtml]] param
 *
 * @param value true to enable HTML stripping, false to disable it
 * @return this instance, to allow chaining of setters
 */
def setStripHtml(value: Boolean): this.type = set(stripHtml, value)

/**
 * Get the value of the [[stripHtml]] param
 *
 * @return true if HTML stripping is enabled
 */
def getStripHtml: Boolean = $(stripHtml)

setDefault(
minTokenLength -> TextTokenizer.MinTokenLength,
toLowercase -> TextTokenizer.ToLowercase,
autoDetectLanguage -> TextTokenizer.AutoDetectLanguage,
autoDetectThreshold -> TextTokenizer.AutoDetectThreshold,
defaultLanguage -> TextTokenizer.DefaultLanguage.entryName
defaultLanguage -> TextTokenizer.DefaultLanguage.entryName,
stripHtml -> TextTokenizer.StripHtml
)

def tokenize(
text: Text,
languageDetector: LanguageDetector = TextTokenizer.LanguageDetector,
analyzer: TextAnalyzer = TextTokenizer.Analyzer
analyzer: TextAnalyzer = if (getStripHtml) TextTokenizer.AnalyzerHtmlStrip else TextTokenizer.Analyzer
): TextTokenizerResult = TextTokenizer.tokenize(
text = text,
languageDetector = languageDetector,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ private[op] trait TransmogrifierDefaults {
val NullString: String = OpVectorColumnMetadata.NullString
val OtherString: String = OpVectorColumnMetadata.OtherString
val DefaultNumOfFeatures: Int = 512
val MaxNumOfFeatures: Int = 16384
val MaxNumOfFeatures: Int = 1 << 17 // 2^17
val DateListDefault: DateListPivot = DateListPivot.SinceLast
val ReferenceDate: org.joda.time.DateTime = DateTimeUtils.now()
val TopK: Int = 20
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,16 @@ class SmartTextMapVectorizerTest
checkDerivedQuantities(res, "f2", Seq(4, 5, 5, 5, 3).map(_.toLong))
}

// Checks that tokenizing with the stripHtml param enabled yields the same tokens, in the same
// order, as tokenizing with TextTokenizer.AnalyzerHtmlStrip passed explicitly as the analyzer.
// Note the trailing space in the first literal: without it the concatenated test name would
// read "...AnalyzerHtmlStripinside SmartTextMapVectorizer".
it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip " +
  "inside SmartTextMapVectorizer" in {
  val exampleHTML = "<body>Big ones, small <h1>ones</h1>, some as big as your head</body>".toText
  // Tokens produced when HTML stripping is requested through the param
  val tokensWithFlag = new SmartTextMapVectorizer()
    .setStripHtml(true).setInput(m1).tokenize(exampleHTML).tokens.value
  // Tokens produced when the HTML-stripping analyzer is supplied by the caller
  val tokensWithAnalyzer = new SmartTextMapVectorizer().setInput(m1)
    .tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value
  tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer
}

private[op] def assertVectorLength(df: DataFrame, output: FeatureLike[OPVector],
expectedLength: Int, textVectorizationMethod: TextVectorizationMethod): Unit = {
val result = df.collect(output)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -712,4 +712,14 @@ class SmartTextVectorizerTest
ts.lengthStdDev.isNaN shouldBe true
}

// Checks that tokenizing with the stripHtml param enabled yields the same tokens, in the same
// order, as tokenizing with TextTokenizer.AnalyzerHtmlStrip passed explicitly as the analyzer.
// Note the trailing space in the first literal: without it the concatenated test name would
// read "...AnalyzerHtmlStripinside SmartTextVectorizer".
it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip " +
  "inside SmartTextVectorizer" in {
  val exampleHTML = "<body>Big ones, small <h1>ones</h1>, some as big as your head</body>".toText
  // Tokens produced when HTML stripping is requested through the param
  val tokensWithFlag = new SmartTextVectorizer()
    .setStripHtml(true).setInput(f1).tokenize(exampleHTML).tokens.value
  // Tokens produced when the HTML-stripping analyzer is supplied by the caller
  val tokensWithAnalyzer = new SmartTextVectorizer().setInput(f1)
    .tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value
  tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer
}

}

0 comments on commit e48831a

Please sign in to comment.