Skip to content

Commit

Permalink
fix: normalize note paragraphs and HTML wrapping
Browse files Browse the repository at this point in the history
  • Loading branch information
yetti committed Jun 11, 2024
1 parent e4cfc66 commit 1c5bdd1
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 9 deletions.
40 changes: 35 additions & 5 deletions app/models/solr_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,42 @@ def extents_information
end

# Returns the paragraphs of every note whose heading matches +header+, each
# rendered as scrubbed HTML.
#
# @param header [String] key suffix passed on to filter_notes_by_header
# @return [Array<String>, nil] HTML paragraphs, or nil when no note matches
def extract_notes_by_header(header)
  matching_notes = filter_notes_by_header(header)
  # Bail out with nil (rather than an empty array) when nothing matched.
  return unless matching_notes.present?

  paragraphs = parse_note_paragraphs(matching_notes)
  wrap_paragraphs_in_html(paragraphs)
end

private

# Selects the notes whose "head" matches the translated label for +header+.
#
# @param header [String] key suffix looked up through I18n as `ead_notes.<header>`
# @return [Array<String>] the matching notes, still JSON-encoded
def filter_notes_by_header(header)
  # Compare parameterized values so differences in case and punctuation are ignored.
  # The translated label does not depend on the note, so it is computed once.
  expected_head = I18n.t("ead_notes.#{header}").parameterize

  # The notes are returned unparsed: extracting paragraphs and wrapping them in HTML
  # is done by parse_note_paragraphs and wrap_paragraphs_in_html, which expect the
  # JSON strings. A note without a "head" can never match, so it is skipped instead
  # of raising on nil.
  notes.select { |note| JSON.parse(note)["head"]&.parameterize == expected_head }
end

# Extracts the "p" (paragraph) values from JSON-encoded notes as one flat array.
#
# @param notes [Array<String>] JSON-encoded notes
# @return [Array] the paragraphs in note order; an empty array when +notes+ is empty
def parse_note_paragraphs(notes)
  # Sometimes note paragraphs are stored as a simple JSON string, and other times as an
  # array. Wrapping the value in an array and flattening it normalizes both shapes into
  # a single-dimensional list.
  #
  # flat_map replaces the former `map! { ... }.flatten!` chain: flatten! returns nil when
  # it has nothing to change (e.g. an empty input), and the bang methods overwrote the
  # caller's array in place.
  notes.flat_map { |note| [JSON.parse(note)["p"]].flatten }
end

# Turns each note paragraph into scrubbed HTML.
#
# Every paragraph is passed through wrap_in_paragraph (which adds `<p></p>` tags when
# needed) and then through Loofah's :strip scrubber to remove any unknown or unsafe tags.
#
# @param paragraphs [Array<String>] paragraphs to convert; replaced in place (map!)
# @return [Array<String>] the same array, now holding the HTML strings
def wrap_paragraphs_in_html(paragraphs)
  paragraphs.map! do |paragraph|
    markup = wrap_in_paragraph(paragraph)
    Loofah.xml_fragment(markup).scrub!(:strip).to_html
  end
end

# Wraps +value+ in a `<p>` element unless it already starts with markup.
#
# @param value [String] a note paragraph
# @return [String] +value+ itself when it begins with "<", otherwise +value+ inside a
#   `<p>` tag built by the Rails content_tag helper
def wrap_in_paragraph(value)
  # Anything starting with "<" is taken to be HTML already and is passed through untouched.
  return value if value.start_with?("<")

  ActionController::Base.helpers.content_tag(:p, value)
end
end
58 changes: 54 additions & 4 deletions spec/models/solr_document_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -141,20 +141,70 @@
end

# extract_notes_by_header yields nil, not an empty array, when no note is present for
# the header, so the description and the single expectation both say nil.
# NOTE(review): the enclosing context is outside this view; presumably it sets up a
# document with no matching notes. Confirm.
it "returns nil" do
  expect(notes_value).to be_nil
end
end

context "when header literal contains punctuation" do
  # The stored heading is "Biographical / Historical" while the lookup key is
  # "biog_hist"; the match is expected to succeed despite the punctuation.
  # NOTE(review): assumes the `ead_notes.biog_hist` translation parameterizes to the
  # same value as this heading. Confirm against the locale file.
  subject(:notes_value) do
    document.extract_notes_by_header("biog_hist")
  end

  let(:document) {
    described_class.new(
      note_json_ssm: ["{\"head\":\"Biographical / Historical\",\"p\":[\"Testing\",\"Testing 2\"],\"audience\":\"internal\"}"]
    )
  }

  it "returns the notes with the matching header" do
    expect(notes_value).to eq ["<p>Testing</p>", "<p>Testing 2</p>"]
  end
end

context "when there is a single note paragraph" do
  # The note's "p" is a plain JSON string; the result should still be an array with
  # one wrapped paragraph.
  subject(:notes_value) { document.extract_notes_by_header("references") }

  let(:document) do
    described_class.new(
      note_json_ssm: ["{\"head\":\"References\",\"p\":\"Testing\",\"audience\":\"internal\"}"]
    )
  end

  it "returns the notes with the matching header" do
    expect(notes_value).to eq ["<p>Testing</p>"]
  end
end

context "when there are multiple note paragraphs" do
  # The note's "p" is a JSON array; each entry should come back as its own wrapped
  # paragraph, in order.
  subject(:notes_value) { document.extract_notes_by_header("references") }

  let(:document) do
    described_class.new(
      note_json_ssm: ["{\"head\":\"References\",\"p\":[\"Testing\", \"Testing 2\"],\"audience\":\"internal\"}"]
    )
  end

  it "returns the notes with the matching header" do
    expect(notes_value).to eq ["<p>Testing</p>", "<p>Testing 2</p>"]
  end
end

context "when note already contains HTML tags" do
  # The paragraph is already a `<p>` element, so it must be returned without a second
  # wrapper around it.
  subject(:notes_value) { document.extract_notes_by_header("references") }

  let(:document) do
    described_class.new(
      note_json_ssm: ["{\"head\":\"References\",\"p\":\"<p>Testing</p>\",\"audience\":\"internal\"}"]
    )
  end

  it "doesn't wrap the content again" do
    expect(notes_value).to eq ["<p>Testing</p>"]
  end
end
Expand Down

0 comments on commit 1c5bdd1

Please sign in to comment.