fix: normalize note paragraphs and HTML wrapping

nla · Jun 11, 2024 · 1c5bdd1 · 1c5bdd1
1 parent e4cfc66
commit 1c5bdd1
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 9 deletions.
diff --git a/app/models/solr_document.rb b/app/models/solr_document.rb
@@ -51,12 +51,42 @@ def extents_information
   end
 
   def extract_notes_by_header(header)
+    filtered_notes = filter_notes_by_header(header)
+    if filtered_notes.present?
+      wrap_paragraphs_in_html(parse_note_paragraphs(filtered_notes))
+    end
+  end
+
+  private
+
+  def filter_notes_by_header(header)
     # compares against the parameterized value of the header to ignore case and punctuation
-    # rubocop:disable Rails/OutputSafety
     notes.select { |note| JSON.parse(note)["head"].parameterize == I18n.t("ead_notes.#{header}").parameterize }
-      .map { |note| JSON.parse(note)["p"] }
-      .flatten
-      .map { |para| "<p>#{para}</p>".html_safe }
-    # rubocop:enable Rails/OutputSafety
+  end
+
+  def parse_note_paragraphs(notes)
+    # Sometimes note paragraphs are returned as a simple JSON string, and other times as an array.
+    # Ensure we always have an array by wrapping each note's paragraphs in an array, then flatten the
+    # result so we only return a single-dimensional array of paragraphs.
+    notes.map! { |note| [JSON.parse(note)["p"]] }
+      .flatten!
+  end
+
+  def wrap_paragraphs_in_html(paragraphs)
+    # Wraps note paragraphs in HTML `<p></p>` tags, then scrubs the HTML to remove any unknown or
+    # unsafe tags.
+    paragraphs.map! do |para|
+      Loofah.xml_fragment(wrap_in_paragraph(para))
+        .scrub!(:strip)
+        .to_html
+    end
+  end
+
+  def wrap_in_paragraph(value)
+    if value.start_with?("<")
+      value
+    else
+      ActionController::Base.helpers.content_tag(:p, value)
+    end
   end
 end
diff --git a/spec/models/solr_document_spec.rb b/spec/models/solr_document_spec.rb
@@ -141,20 +141,70 @@
       end
 
       it "returns an empty array" do
-        expect(notes_value).to eq []
+        expect(notes_value).to be_nil
       end
     end
 
     context "when header literal contains punctuation" do
       subject(:notes_value) do
-        document = described_class.new(
-          note_json_ssm: ["{\"head\":\"Biographical / Historical\",\"p\":\"Testing\",\"audience\":\"internal\"}"]
+        document.extract_notes_by_header("biog_hist")
+      end
+
+      let(:document) {
+        described_class.new(
+          note_json_ssm: ["{\"head\":\"Biographical / Historical\",\"p\":[\"Testing\",\"Testing 2\"],\"audience\":\"internal\"}"]
         )
+      }
 
-        document.extract_notes_by_header("biog_hist")
+      it "returns the notes with the matching header" do
+        expect(notes_value).to eq ["<p>Testing</p>", "<p>Testing 2</p>"]
+      end
+    end
+
+    context "when there is a single note paragraph" do
+      subject(:notes_value) do
+        document.extract_notes_by_header("references")
+      end
+
+      let(:document) {
+        described_class.new(
+          note_json_ssm: ["{\"head\":\"References\",\"p\":\"Testing\",\"audience\":\"internal\"}"]
+        )
+      }
+
+      it "returns the notes with the matching header" do
+        expect(notes_value).to eq ["<p>Testing</p>"]
+      end
+    end
+
+    context "when there are multiple note paragraphs" do
+      subject(:notes_value) do
+        document.extract_notes_by_header("references")
       end
 
+      let(:document) {
+        described_class.new(
+          note_json_ssm: ["{\"head\":\"References\",\"p\":[\"Testing\", \"Testing 2\"],\"audience\":\"internal\"}"]
+        )
+      }
+
       it "returns the notes with the matching header" do
+        expect(notes_value).to eq ["<p>Testing</p>", "<p>Testing 2</p>"]
+      end
+    end
+
+    context "when note already contains HTML tags" do
+      subject(:notes_value) do
+        document.extract_notes_by_header("references")
+      end
+
+      let(:document) {
+        described_class.new(
+          note_json_ssm: ["{\"head\":\"References\",\"p\":\"<p>Testing</p>\",\"audience\":\"internal\"}"]
+        )
+      }
+
+      it "doesn't wrap the content again" do
         expect(notes_value).to eq ["<p>Testing</p>"]
       end
     end