From 6889e639ed960026064fe6a7713d034edef272ff Mon Sep 17 00:00:00 2001 From: Ben Pennell Date: Tue, 29 Oct 2024 15:51:42 -0400 Subject: [PATCH 1/3] Sanitize html from abstract field in search results --- app/controllers/catalog_controller.rb | 2 +- app/helpers/application_helper.rb | 3 +++ spec/features/search_results_spec.rb | 27 +++++++++++++++++++++++ spec/support/oai_sample_solr_documents.rb | 2 +- 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 spec/features/search_results_spec.rb diff --git a/app/controllers/catalog_controller.rb b/app/controllers/catalog_controller.rb index 4a5db2378..36c8b9d03 100755 --- a/app/controllers/catalog_controller.rb +++ b/app/controllers/catalog_controller.rb @@ -129,7 +129,7 @@ def single_item_search_builder(id) config.add_index_field solr_name('creator_label', :stored_searchable), label: 'Creator', itemprop: 'creator', link_to_search: solr_name('creator', :facetable) config.add_index_field solr_name('date_captured', :stored_searchable), itemprop: 'dateCaptured', label: 'Date captured' config.add_index_field solr_name('date_issued', :stored_searchable), label: 'Date of publication' - config.add_index_field solr_name('abstract', :stored_searchable), label: 'Abstract' + config.add_index_field solr_name('abstract', :stored_searchable), label: 'Abstract', helper_method: :sanitize_abstract_field config.add_index_field solr_name('resource_type', :stored_searchable), label: 'Resource type', link_to_search: solr_name('resource_type', :facetable) config.add_index_field solr_name('based_near_label', :stored_searchable), itemprop: 'contentLocation', label: 'Location', link_to_search: solr_name('based_near_label', :facetable) diff --git a/app/helpers/application_helper.rb b/app/helpers/application_helper.rb index 71249b93b..74f17900e 100755 --- a/app/helpers/application_helper.rb +++ b/app/helpers/application_helper.rb @@ -1,3 +1,6 @@ # frozen_string_literal: true module ApplicationHelper + def sanitize_abstract_field(options = {}) + options[:value].map { |v| ActionController::Base.helpers.strip_tags(v) }.join(' and ') + end end diff --git a/spec/features/search_results_spec.rb b/spec/features/search_results_spec.rb new file mode 100644 index 000000000..6694106df --- /dev/null +++ b/spec/features/search_results_spec.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true +require 'rails_helper' +require Rails.root.join('spec/support/oai_sample_solr_documents.rb') +include Warden::Test::Helpers + +RSpec.describe 'Search Results', type: :feature, js: false do + let(:solr) { Blacklight.default_index.connection } + + before do + solr.delete_by_query('*:*') # delete everything in Solr + solr.add([SLEEPY_HOLLOW, MYSTERIOUS_AFFAIR, TIME_MACHINE]) + solr.commit + end + + after do + solr.delete_by_query('*:*') + solr.commit + end + + it 'html tags are stripped from abstract field' do + visit '/catalog' + expect(page).to have_content(SLEEPY_HOLLOW[:title_tesim][0]) + expect(page).to have_content(MYSTERIOUS_AFFAIR[:title_tesim][0]) + expect(page).to have_content(TIME_MACHINE[:title_tesim][0]) + expect(page).to have_content("Actual Abstract and another abstract") + end +end diff --git a/spec/support/oai_sample_solr_documents.rb b/spec/support/oai_sample_solr_documents.rb index 27f4c0607..9abe9333d 100644 --- a/spec/support/oai_sample_solr_documents.rb +++ b/spec/support/oai_sample_solr_documents.rb @@ -36,7 +36,7 @@ BEN_FRANKLIN = { timestamp: '2021-11-23T16:05:46.046Z', system_create_dtsi: '2021-11-22T22:10:28Z', system_modified_dtsi: '2021-11-22T22:10:29Z', has_model_ssim: ['Article'], id: '4t64gn166', accessControl_ssim: ['1f80985b-5b9a-4046-bafb-7d3c1fab35ac'], depositor_ssim: ['admin'], depositor_tesim: ['admin'], title_tesim: ['Autobiography of Benjamin Franklin'], title_sim: ['Autobiography of Benjamin Franklin'], date_modified_dtsi: '2017-10-02T17:54:29Z', language_label_tesim: ['English'], resource_type_tesim: ['Book'], resource_type_sim: ['Book'], creator_tesim: ['Franklin, Benjamin'], creator_sim: ['Franklin, Benjamin'], contributor_tesim: ['Smith, Jennifer'], contributor_sim: ['Smith, Jennifer'], language_tesim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], language_sim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], description_tesim: ['Abstract'], rights_statement_tesim: ['http://www.europeana.eu/portal/rights/rr-r.html'], date_created_dtsim: ['2017-10-02T17:54:29Z'], related_url_tesim: ['http://dx.doi.org/10.1186/1753-6561-3-S7-S87'], thumbnail_path_ss: '/assets/work-ff055336041c3f7d310ad69109eda4a887b16ec501f35afc0a547c4adb97ee72.png', suppressed_bsi: false, member_ids_ssim: [], member_of_collections_ssim: [], member_of_collection_ids_ssim: [], generic_type_sim: ['Work'], file_set_ids_ssim: [], visibility_ssi: 'open', admin_set_sim: '', admin_set_tesim: 'default', title_sort_ssi: 'autobiography of benjamin franklin', human_readable_type_sim: 'Scholarly Article or Book Chapter', human_readable_type_tesim: 'Scholarly Article or Book Chapter', read_access_group_ssim: ['public'] } -TIME_MACHINE = { timestamp: '2021-11-23T16:05:47.047Z', system_create_dtsi: '2021-11-22T22:10:30Z', system_modified_dtsi: '2021-11-22T22:10:31Z', has_model_ssim: ['Article'], id: 'x920fw84d', accessControl_ssim: ['1eeaaefa-fc89-49fd-8523-e4c829f956fb'], depositor_ssim: ['admin'], depositor_tesim: ['admin'], title_tesim: ['The Time Machine'], title_sim: ['The Time Machine'], date_modified_dtsi: '2017-10-02T17:58:46Z', language_label_tesim: ['English'], resource_type_tesim: ['Book'], resource_type_sim: ['Book'], creator_tesim: ['Wells, H. G.'], creator_sim: ['Wells, H. G.'], contributor_tesim: ['Smith, Jennifer'], contributor_sim: ['Smith, Jennifer'], language_tesim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], language_sim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], description_tesim: ['Abstract'], rights_statement_tesim: ['http://www.europeana.eu/portal/rights/rr-r.html'], date_created_dtsim: ['2017-10-02T17:58:46Z'], related_url_tesim: ['http://dx.doi.org/10.1186/1753-6561-3-S7-S87'], thumbnail_path_ss: '/assets/work-ff055336041c3f7d310ad69109eda4a887b16ec501f35afc0a547c4adb97ee72.png', suppressed_bsi: false, member_ids_ssim: [], member_of_collections_ssim: [], member_of_collection_ids_ssim: [], generic_type_sim: ['Work'], file_set_ids_ssim: [], visibility_ssi: 'open', admin_set_sim: '', admin_set_tesim: 'default', title_sort_ssi: 'the time machine', human_readable_type_sim: 'Scholarly Article or Book Chapter', human_readable_type_tesim: 'Scholarly Article or Book Chapter', read_access_group_ssim: ['public'] } +TIME_MACHINE = { timestamp: '2021-11-23T16:05:47.047Z', system_create_dtsi: '2021-11-22T22:10:30Z', system_modified_dtsi: '2021-11-22T22:10:31Z', has_model_ssim: ['Article'], id: 'x920fw84d', accessControl_ssim: ['1eeaaefa-fc89-49fd-8523-e4c829f956fb'], depositor_ssim: ['admin'], depositor_tesim: ['admin'], title_tesim: ['The Time Machine'], title_sim: ['The Time Machine'], date_modified_dtsi: '2017-10-02T17:58:46Z', language_label_tesim: ['English'], resource_type_tesim: ['Book'], resource_type_sim: ['Book'], creator_tesim: ['Wells, H. G.'], creator_sim: ['Wells, H. G.'], contributor_tesim: ['Smith, Jennifer'], contributor_sim: ['Smith, Jennifer'], language_tesim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], language_sim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], abstract_tesim: ['Actual Abstract', 'another abstract'], description_tesim: ['Abstract'], rights_statement_tesim: ['http://www.europeana.eu/portal/rights/rr-r.html'], date_created_dtsim: ['2017-10-02T17:58:46Z'], related_url_tesim: ['http://dx.doi.org/10.1186/1753-6561-3-S7-S87'], thumbnail_path_ss: '/assets/work-ff055336041c3f7d310ad69109eda4a887b16ec501f35afc0a547c4adb97ee72.png', suppressed_bsi: false, member_ids_ssim: [], member_of_collections_ssim: [], member_of_collection_ids_ssim: [], generic_type_sim: ['Work'], file_set_ids_ssim: [], visibility_ssi: 'open', admin_set_sim: '', admin_set_tesim: 'default', title_sort_ssi: 'the time machine', human_readable_type_sim: 'Scholarly Article or Book Chapter', human_readable_type_tesim: 'Scholarly Article or Book Chapter', read_access_group_ssim: ['public'] } HUCK_FINN = { timestamp: '2021-11-23T16:05:48.048Z', system_create_dtsi: '2021-11-22T22:10:32Z', system_modified_dtsi: '2021-11-22T22:10:33Z', has_model_ssim: ['Article'], id: 'sn009x76k', accessControl_ssim: ['0b02650c-e24a-469b-880e-61b392bd6acc'], depositor_ssim: ['admin'], depositor_tesim: ['admin'], title_tesim: ['Adventures of Huckleberry Finn'], title_sim: ['Adventures of Huckleberry Finn'], date_modified_dtsi: '2017-10-02T17:41:49Z', language_label_tesim: ['English'], resource_type_tesim: ['Book'], resource_type_sim: ['Book'], creator_tesim: ['Twain, Mark'], creator_sim: ['Twain, Mark'], contributor_tesim: ['Smith, Jennifer'], contributor_sim: ['Smith, Jennifer'], language_tesim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], language_sim: ['http://id.loc.gov/vocabulary/iso639-2/eng'], description_tesim: ['Abstract'], rights_statement_tesim: ['http://www.europeana.eu/portal/rights/rr-r.html'], date_created_dtsim: ['2017-10-02T17:41:49Z'], related_url_tesim: ['http://dx.doi.org/10.1186/1753-6561-3-S7-S87'], thumbnail_path_ss: '/assets/work-ff055336041c3f7d310ad69109eda4a887b16ec501f35afc0a547c4adb97ee72.png', suppressed_bsi: false, member_ids_ssim: [], member_of_collections_ssim: [], member_of_collection_ids_ssim: [], generic_type_sim: ['Work'], file_set_ids_ssim: [], visibility_ssi: 'open', admin_set_sim: '', admin_set_tesim: 'default', title_sort_ssi: 'adventures of huckleberry finn', human_readable_type_sim: 'Scholarly Article or Book Chapter', human_readable_type_tesim: 'Scholarly Article or Book Chapter', read_access_group_ssim: ['public'] } From cbbadec0aa0886a0bc917ae8d416ae462d6a8a06 Mon Sep 17 00:00:00 2001 From: Ben Pennell Date: Tue, 29 Oct 2024 16:25:51 -0400 Subject: [PATCH 2/3] Permit style attributes and blockquotes through the formatted text renderer --- app/renderers/hyrax/renderers/formatted_text_renderer.rb | 4 ++-- app/views/hyrax/artworks/_attribute_rows.html.erb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/renderers/hyrax/renderers/formatted_text_renderer.rb b/app/renderers/hyrax/renderers/formatted_text_renderer.rb index fa2a0d7e6..aa7455897 100644 --- a/app/renderers/hyrax/renderers/formatted_text_renderer.rb +++ b/app/renderers/hyrax/renderers/formatted_text_renderer.rb @@ -15,8 +15,8 @@ def attribute_value_to_html(value) # Sanitize the value, allowing only safe HTML tags and attributes def get_sanitized_string(string) # Define allowed tags and attributes - allowed_tags = %w[strong em b i u p br small mark sub sup a ul ol li dl dt dd div span h1 h2 h3 h4 h5 h6] - allowed_attributes = %w[href] + allowed_tags = %w[strong em b i u p br small mark sub sup a ul ol li dl dt dd div span h1 h2 h3 h4 h5 h6 blockquote] + allowed_attributes = %w[href style] sanitize(string, tags: allowed_tags, attributes: allowed_attributes) end diff --git a/app/views/hyrax/artworks/_attribute_rows.html.erb b/app/views/hyrax/artworks/_attribute_rows.html.erb index ad78b0cc5..57e3f38db 100644 --- a/app/views/hyrax/artworks/_attribute_rows.html.erb +++ b/app/views/hyrax/artworks/_attribute_rows.html.erb @@ -1,5 +1,5 @@ <%= presenter.attribute_to_html(:creator_display, label: 'Creator', render_as: :person, html_dl: true) %> -<%= presenter.attribute_to_html(:abstract, render_as: :formatted_text, render_as: :formatted_text, html_dl: true) %> +<%= presenter.attribute_to_html(:abstract, render_as: :formatted_text, html_dl: true) %> <%= presenter.attribute_to_html(:description, html_dl: true) %> <%= presenter.attribute_to_html(:date_issued, html_dl: true, label: "Date of publication") %> <%= presenter.attribute_to_html(:note, html_dl: true) %> From f84c93305f1f3a7013e410c2cc9f4b29949a4230 Mon Sep 17 00:00:00 2001 From: Ben Pennell Date: Tue, 29 Oct 2024 16:56:24 -0400 Subject: [PATCH 3/3] Rubocop --- spec/features/search_results_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/features/search_results_spec.rb b/spec/features/search_results_spec.rb index 6694106df..63be639be 100644 --- a/spec/features/search_results_spec.rb +++ b/spec/features/search_results_spec.rb @@ -22,6 +22,6 @@ expect(page).to have_content(SLEEPY_HOLLOW[:title_tesim][0]) expect(page).to have_content(MYSTERIOUS_AFFAIR[:title_tesim][0]) expect(page).to have_content(TIME_MACHINE[:title_tesim][0]) - expect(page).to have_content("Actual Abstract and another abstract") + expect(page).to have_content('Actual Abstract and another abstract') end end