Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

develop MergeListTag filter #28

Merged
merged 6 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ case class Paragraph(
nearFreq: Int = 1,
remove: AnyRef = null
) {
final val cssSelectorSeparator = ">"

def renderInto[T <: Appendable](bldr: T): T = {
if (path != null && path.nonEmpty) {
bldr.append(path)
Expand All @@ -41,6 +43,8 @@ case class Paragraph(
bldr
}

def cssSelectors: Seq[String] = this.path.split(cssSelectorSeparator)

def isAlive: Boolean = remove == null
def isDeleted: Boolean = !isAlive
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.filters.base.DocFilter
import com.worksap.nlp.uzushio.lib.cleaning.Document

class MergeListTag extends DocFilter {
final val acceptedTags = Seq("li", "option")
final val mdListSymbol = "- "

def containsAcceptedTag(cssSelectorStrs: Seq[String]): Boolean = {
extractDescendantTag(cssSelectorStrs, acceptedTags) match {
case Some(_) => true
case None => false
}
}

def extractDescendantTag(
cssSelectorStrs: Seq[String],
tagNames: Seq[String]
): Option[String] = {
val iter = cssSelectorStrs.reverse.iterator
var i = 0

while (iter.hasNext) {
val tagWithCSS = iter.next()
val tagWithAttrs = tagWithCSS.split("[#\\.]")
i += 1
if (acceptedTags.contains(tagWithAttrs.head)) {
return Option(tagWithCSS)
}
}
return None
}

override def checkDocument(doc: Document): Document = {
var paragraphs = doc.aliveParagraphs.to[Seq]

(0 until paragraphs.length - 1).foreach { i =>
val paragraph = paragraphs(i)
val nextParagraph = paragraphs(i + 1)
val isAccteptedTags = containsAcceptedTag(paragraph.cssSelectors) && containsAcceptedTag(
nextParagraph.cssSelectors
)

if (isAccteptedTags && paragraph.path == nextParagraph.path) {
val mergedParagraph = nextParagraph.copy(
text = List(paragraph.text, nextParagraph.text)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line will have O(n^2) complexity because strings are immutable in Java. Should not still cause much problems I think.

Copy link
Contributor Author

@otakumesi otakumesi Nov 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I can afford to change this line after completing various tasks, I refactor it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are OK with leaving it as is because

  1. Lists should not be very large
  2. Individual strings are not 1 character long

.map(s => if (s.startsWith(mdListSymbol)) s else mdListSymbol + s).mkString("\n"),
exactFreq = math.min(paragraph.exactFreq, nextParagraph.exactFreq),
nearFreq = math.min(paragraph.nearFreq, nextParagraph.nearFreq)
)
paragraphs = paragraphs.updated(i, paragraph.copy(remove = paragraph))
paragraphs = paragraphs.updated(i + 1, mergedParagraph)
}
}

doc.copy(paragraphs = paragraphs)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package com.worksap.nlp.uzushio.lib.cleaning

import org.scalatest.freespec.AnyFreeSpec

class ParagraphSpec extends AnyFreeSpec {
"Paragraph" - {
"return css selector strings" in {
val par = Paragraph("body>p.text", "hello")
assert(par.cssSelectors == Seq("body", "p.text"))
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.cleaning.Document
import org.scalatest.freespec.AnyFreeSpec

class MergeListTagSpec extends AnyFreeSpec {
"MergeListTag" - {
val filter = new MergeListTag()
"do no operation for empty document" in {
val doc = testDoc("")
assert("" == filter.checkDocument(doc).aliveParagraphs.map(_.text).mkString(""))
}

"do no operation for no list document" in {
val texts = Seq("test 1", "test 2", "test 3")
val doc = Document(testParagraphs(texts))
assert(texts.mkString("") == filter.checkDocument(doc).aliveParagraphs.map(_.text).mkString(""))
}

"merge list tag texts and put 'remove' sign on rests for document including list tags" in {
val texts = Seq("test 1", "li test 1", "li test 2", "li test 3", "li test 4", "li test 5", "li test 6", "option test 1", "option test 2", "test 2")
val nearFreqs = Seq(1, 2, 3, 5, 4, 1, 1, 1, 2, 1)
val exactFreqs = Seq(1, 2, 3, 5, 4, 1, 1, 1, 2, 1)
val paths = Seq("body>p.text", "body>ul>li.text", "body>ul>li.text", "body>ul>li.text2", "body>ul>li.text2", "body>ul>li.text2", "body>ul>li.text3", "body>datalist>option.text", "body>datalist>option.text", "body>p.text")
val paragraphs = testParagraphs(texts, nearFreqs, exactFreqs, paths)
val doc = Document(paragraphs)

assert(Seq("test 1", "- li test 1\n- li test 2", "- li test 3\n- li test 4\n- li test 5", "li test 6", "- option test 1\n- option test 2", "test 2") == filter.checkDocument(doc).aliveParagraphs.map(_.text).toSeq)
assert(2 == filter.checkDocument(doc).aliveParagraphs.map(_.nearFreq).drop(1).toList.head)
assert(2 == filter.checkDocument(doc).aliveParagraphs.map(_.exactFreq).drop(1).toList.head)
assert(Seq(false, true, false, true, true, false, false, true, false, false) == filter.checkDocument(doc).paragraphs.map(_.remove != null))
assert(Seq("body>p.text", "body>ul>li.text", "body>ul>li.text2", "body>ul>li.text3", "body>datalist>option.text", "body>p.text") == filter.checkDocument(doc).aliveParagraphs.map(_.path).toSeq)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,20 @@ package object filters {
)
}

def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int]): IndexedSeq[Paragraph] = {
(texts, nearFreqs)
.zipped
.map ((text, freq) => Paragraph("", text, 0, 1, freq))
.toIndexedSeq
def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int] = Seq(), exactFreqs: Seq[Int] = Seq(), paths: Seq[String] = Seq()): IndexedSeq[Paragraph] = {
require(texts.length == nearFreqs.length || nearFreqs.isEmpty)
require(texts.length == exactFreqs.length || exactFreqs.isEmpty)
require(texts.length == paths.length || paths.isEmpty)

val nearFreqs_ = if (!nearFreqs.isEmpty) nearFreqs else Seq.fill(texts.length)(1)
val exactFreqs_ = if (!exactFreqs.isEmpty) exactFreqs else Seq.fill(texts.length)(1)
val paths_ = if (!paths.isEmpty) paths else 0.to(texts.length).map(_ => "body>p.text")

texts
.zip(nearFreqs_)
.zip(exactFreqs_)
.zip(paths_)
.map { case (((text, nearFreq), exactFreq), path) => Paragraph(path, text, 0, exactFreq, nearFreq) }
.toIndexedSeq
}
}