WorksApplications · eiennohito · Nov 16, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 15, 2023
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/cleaning/Pipeline.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/cleaning/Pipeline.scala
@@ -32,6 +32,8 @@ case class Paragraph(
     nearFreq: Int = 1,
     remove: AnyRef = null
 ) {
+  final val cssSelectorSeparator = ">"
+
   def renderInto[T <: Appendable](bldr: T): T = {
     if (path != null && path.nonEmpty) {
       bldr.append(path)
@@ -41,6 +43,8 @@ case class Paragraph(
     bldr
   }
 
+  def cssSelectors: Seq[String] = this.path.split(cssSelectorSeparator)
+
   def isAlive: Boolean = remove == null
   def isDeleted: Boolean = !isAlive
 }

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTag.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTag.scala
@@ -0,0 +1,59 @@
+package com.worksap.nlp.uzushio.lib.filters
+
+import com.worksap.nlp.uzushio.lib.filters.base.DocFilter
+import com.worksap.nlp.uzushio.lib.cleaning.Document
+
+class MergeListTag extends DocFilter {
+  final val acceptedTags = Seq("li", "option")
+  final val mdListSymbol = "- "
+
+  def containsAcceptedTag(cssSelectorStrs: Seq[String]): Boolean = {
+    extractDescendantTag(cssSelectorStrs, acceptedTags) match {
+      case Some(_) => true
+      case None => false
+    }
+  }
+
+  def extractDescendantTag(
+      cssSelectorStrs: Seq[String],
+      tagNames: Seq[String]
+  ): Option[String] = {
+    val iter = cssSelectorStrs.reverse.iterator
+    var i = 0
+
+    while (iter.hasNext) {
+      val tagWithCSS = iter.next()
+      val tagWithAttrs = tagWithCSS.split("[#\\.]")
+      i += 1
+      if (acceptedTags.contains(tagWithAttrs.head)) {
+        return Option(tagWithCSS)
+      }
+    }
+    return None
+  }
+
+  override def checkDocument(doc: Document): Document = {
+    var paragraphs = doc.aliveParagraphs.to[Seq]
+
+    (0 until paragraphs.length - 1).foreach { i =>
+      val paragraph = paragraphs(i)
+      val nextParagraph = paragraphs(i + 1)
+      val isAccteptedTags = containsAcceptedTag(paragraph.cssSelectors) && containsAcceptedTag(
+        nextParagraph.cssSelectors
+      )
+
+      if (isAccteptedTags && paragraph.path == nextParagraph.path) {
+        val mergedParagraph = nextParagraph.copy(
+          text = List(paragraph.text, nextParagraph.text)
+            .map(s => if (s.startsWith(mdListSymbol)) s else mdListSymbol + s).mkString("\n"),
+          exactFreq = math.min(paragraph.exactFreq, nextParagraph.exactFreq),
+          nearFreq = math.min(paragraph.nearFreq, nextParagraph.nearFreq)
+        )
+        paragraphs = paragraphs.updated(i, paragraph.copy(remove = paragraph))
+        paragraphs = paragraphs.updated(i + 1, mergedParagraph)
+      }
+    }
+
+    doc.copy(paragraphs = paragraphs)
+  }
+}
diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/ParagraphSpec.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/cleaning/ParagraphSpec.scala
@@ -0,0 +1,12 @@
+package com.worksap.nlp.uzushio.lib.cleaning
+
+import org.scalatest.freespec.AnyFreeSpec
+
+class ParagraphSpec extends AnyFreeSpec {
+  "Paragraph" - {
+    "return css selector strings" in {
+      val par = Paragraph("body>p.text", "hello")
+      assert(par.cssSelectors == Seq("body", "p.text"))
+    }
+  }
+}
diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTagSpec.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/MergeListTagSpec.scala
@@ -0,0 +1,35 @@
+package com.worksap.nlp.uzushio.lib.filters
+
+import com.worksap.nlp.uzushio.lib.cleaning.Document
+import org.scalatest.freespec.AnyFreeSpec
+
+class MergeListTagSpec extends AnyFreeSpec {
+  "MergeListTag" - {
+    val filter = new MergeListTag()
+    "do no operation for empty document" in {
+      val doc = testDoc("")
+      assert("" == filter.checkDocument(doc).aliveParagraphs.map(_.text).mkString(""))
+    }
+
+    "do no operation for no list document" in {
+      val texts = Seq("test 1", "test 2", "test 3")
+      val doc = Document(testParagraphs(texts))
+      assert(texts.mkString("") == filter.checkDocument(doc).aliveParagraphs.map(_.text).mkString(""))
+    }
+
+    "merge list tag texts and put 'remove' sign on rests for document including list tags" in {
+      val texts = Seq("test 1", "li test 1", "li test 2", "li test 3", "li test 4", "li test 5", "li test 6", "option test 1", "option test 2", "test 2")
+      val nearFreqs = Seq(1, 2, 3, 5, 4, 1, 1, 1, 2, 1)
+      val exactFreqs = Seq(1, 2, 3, 5, 4, 1, 1, 1, 2, 1)
+      val paths = Seq("body>p.text", "body>ul>li.text", "body>ul>li.text", "body>ul>li.text2", "body>ul>li.text2", "body>ul>li.text2", "body>ul>li.text3", "body>datalist>option.text", "body>datalist>option.text", "body>p.text")
+      val paragraphs = testParagraphs(texts, nearFreqs, exactFreqs, paths)
+      val doc = Document(paragraphs)
+
+      assert(Seq("test 1", "- li test 1\n- li test 2", "- li test 3\n- li test 4\n- li test 5", "li test 6", "- option test 1\n- option test 2", "test 2") == filter.checkDocument(doc).aliveParagraphs.map(_.text).toSeq)
+      assert(2 == filter.checkDocument(doc).aliveParagraphs.map(_.nearFreq).drop(1).toList.head)
+      assert(2 == filter.checkDocument(doc).aliveParagraphs.map(_.exactFreq).drop(1).toList.head)
+      assert(Seq(false, true, false, true, true, false, false, true, false, false) == filter.checkDocument(doc).paragraphs.map(_.remove != null))
+      assert(Seq("body>p.text", "body>ul>li.text", "body>ul>li.text2", "body>ul>li.text3", "body>datalist>option.text", "body>p.text") == filter.checkDocument(doc).aliveParagraphs.map(_.path).toSeq)
+    }
+  }
+}
diff --git a/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/package.scala b/lib/src/test/scala/com/worksap/nlp/uzushio/lib/filters/package.scala
@@ -32,10 +32,20 @@ package object filters {
     )
   }
 
-  def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int]): IndexedSeq[Paragraph] = {
-      (texts, nearFreqs)
-        .zipped
-        .map ((text, freq) => Paragraph("", text, 0, 1, freq))
-        .toIndexedSeq
+  def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int] = Seq(), exactFreqs: Seq[Int] = Seq(), paths: Seq[String] = Seq()): IndexedSeq[Paragraph] = {
+    require(texts.length == nearFreqs.length || nearFreqs.isEmpty)
+    require(texts.length == exactFreqs.length || exactFreqs.isEmpty)
+    require(texts.length == paths.length || paths.isEmpty)
+
+    val nearFreqs_ = if (!nearFreqs.isEmpty) nearFreqs else Seq.fill(texts.length)(1)
+    val exactFreqs_ = if (!exactFreqs.isEmpty) exactFreqs else Seq.fill(texts.length)(1)
+    val paths_ = if (!paths.isEmpty) paths else 0.to(texts.length).map(_ => "body>p.text")
+
+    texts
+      .zip(nearFreqs_)
+      .zip(exactFreqs_)
+      .zip(paths_)
+      .map { case (((text, nearFreq), exactFreq), path) => Paragraph(path, text, 0, exactFreq, nearFreq) }
+      .toIndexedSeq
   }
 }