From 6e9a639194385fc794f5374abd81d896401c44e6 Mon Sep 17 00:00:00 2001 From: Brian Hayden Date: Tue, 19 Dec 2023 17:02:11 -0600 Subject: [PATCH 1/4] Address expiry of studies from clinicaltrials.gov Added a task to un-publish trials that are no longer in the list of recruiting trials for the location that is being used to pull from clinicaltrials.gov. - Refactored ctgov connector nct_ids_for_location method to use current ct.gov API. Also uses recursion without passing arrays of IDs into a new method invocation for memory benefits. - In same method, include filter for only recruiting studies at location. - Allow cleanup_stray_trials and stray_trials to accept an argument of an array of IDs that should be considered "still recruiting"; this can be leveraged for testing bc there isn't a great way to mock ct.gov API responses given the current state of coupling in the connector. - Tests for these updates. --- lib/connectors/ctgov.rb | 93 +++++++++++++++++++++-------------- lib/tasks/ctgov.rake | 8 +++ spec/connectors/ctgov_spec.rb | 46 +++++++++++++++++ spec/factories/parser.rb | 6 +++ spec/factories/trial.rb | 2 + 5 files changed, 119 insertions(+), 36 deletions(-) create mode 100644 spec/connectors/ctgov_spec.rb create mode 100644 spec/factories/parser.rb diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb index 94658d9..72e8ce6 100644 --- a/lib/connectors/ctgov.rb +++ b/lib/connectors/ctgov.rb @@ -90,55 +90,76 @@ def clear TrialCondition.delete_all end - def stray_trials - Trial.where.not(system_id: nct_ids_for_location(@system_info.search_term)) + def site_nct_ids + nct_ids_for_location(SystemInfo.search_term) end - def cleanup_stray_trials - stray_trials.destroy_all + def stray_trials(nct_ids = nil) + nct_ids ||= site_nct_ids + Trial.where(parser_id: @parser_id).where.not(nct_id: nct_ids) end - private - - def extract_zip - dirname = "#{Rails.root}/tmp/" - unless File.directory?(dirname) - FileUtils.mkdir_p(dirname) - end - - unless File.directory?("#{dirname}trials/") - FileUtils.mkdir_p("#{dirname}trials/") - end + def cleanup_stray_trials(nct_ids = nil) + nct_ids ||= site_nct_ids + stray_trials(nct_ids).update(visible: false) + end - FileUtils.rm_rf(Dir.glob("#{dirname}trials/*")) - Zip::File.open("#{dirname}search_result.zip") do |file| - file.each do |entry| - file.extract(entry, "#{dirname}trials/#{entry.name}") - end - end + def nct_ids_for_location(location, page_token = nil) + csc = 'M Health Fairview Clinics and Surgery Center' + ids = [] + q = { + 'query.locn' => "SEARCH[Location](AREA[LocationFacility]#{location} AND AREA[LocationStatus]RECRUITING)", + fields: "NCTId", + countTotal: true, + pageSize: 1000, + format: "json" + } + + # API only wants a pageToken arg at all if we are actually asking for one. + if !page_token.blank? + q[:pageToken] = page_token end - def nct_ids_for_location(location, start = 1, endd = 1000, ids = []) response = HTTParty.get( - "https://classic.clinicaltrials.gov/api/query/study_fields", - query: { - expr: "SEARCH[Location](AREA[LocationFacility]#{location})", - fields: "NCTId", - min_rnk: start, - max_rnk: endd, - fmt: "json" - } + "https://clinicaltrials.gov/api/v2/studies", + query: q ) + payload = JSON.parse(response.body || "{}") - response_ids = Array(JSON.parse(response.body || "{}").dig("StudyFieldsResponse").dig("StudyFields")).map do |result| - Array(result.dig("NCTId")).first + response_ids = Array(payload.dig("studies")).map do |result| + result.dig("protocolSection").dig("identificationModule").dig("nctId") end - if response_ids.empty? - ids - else - nct_ids_for_location(location, endd + 1, endd + 1000, ids + response_ids) + # Add the ids we just received, and ... + ids.push(*response_ids) + + # ... recurse if there's another page. + if payload.dig("nextPageToken") + ids.push(*(nct_ids_for_location(location, payload.dig("nextPageToken")))) end + + return ids end + + private + + def extract_zip + dirname = "#{Rails.root}/tmp/" + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end + + unless File.directory?("#{dirname}trials/") + FileUtils.mkdir_p("#{dirname}trials/") + end + + FileUtils.rm_rf(Dir.glob("#{dirname}trials/*")) + Zip::File.open("#{dirname}search_result.zip") do |file| + file.each do |entry| + file.extract(entry, "#{dirname}trials/#{entry.name}") + end + end + end + end end diff --git a/lib/tasks/ctgov.rake b/lib/tasks/ctgov.rake index b15611d..a180b26 100644 --- a/lib/tasks/ctgov.rake +++ b/lib/tasks/ctgov.rake @@ -34,6 +34,14 @@ namespace :studyfinder do Trial.import force: true end + task cleanup_strays: :environment do |t, args| + puts "Cleaning up stray trials" + connector = Connectors::Ctgov.new + trials = connector.cleanup_stray_trials + puts "Have un-published (system_ids): " + puts trials.map{ |e| " #{e.system_id}\n" } + end + # ============================================================================================== # studyfinder:ctgov:reload_all # Note: Dangerous business here!! This will delete and reload data from every diff --git a/spec/connectors/ctgov_spec.rb b/spec/connectors/ctgov_spec.rb new file mode 100644 index 0000000..650d869 --- /dev/null +++ b/spec/connectors/ctgov_spec.rb @@ -0,0 +1,46 @@ +require 'rails_helper' +require 'connectors/ctgov' + +describe Connectors::Ctgov do + context "cleanup_stray_trials" do + it "hides trials that are no longer actively recruiting at the given location(s)" do + parser = create(:parser) + system_info = create(:system_info, initials: 'TSTU') + ctgov = Connectors::Ctgov.new + will_hide = create(:trial, parser: parser) + wont_hide = create_list(:trial, 5, parser: parser) + remaining_ids = wont_hide.map { |e| e.nct_id } + + expect(will_hide.visible).to be_truthy + expect(wont_hide.first.visible).to be_truthy + + strays = ctgov.stray_trials(remaining_ids) + expect(strays.map { |e| e.nct_id }).to include(will_hide.nct_id) + + ctgov.cleanup_stray_trials(remaining_ids) + will_hide.reload + expect(will_hide.visible).to be_falsey + expect(wont_hide.first.visible).to be_truthy + end + + it "does not hide trials from a different parser" do + parser = create(:parser) + parser2 = create(:parser, name: 'foobar', klass: 'Parsers::Foobar') + system_info = create(:system_info, initials: 'TSTU') + ctgov = Connectors::Ctgov.new + will_hide = create(:trial, parser: parser) + wont_hide = create_list(:trial, 5, parser: parser) + wont_hide_2 = create(:trial, parser: parser2) + remaining_ids = wont_hide.map { |e| e.nct_id } + + expect(will_hide.visible).to be_truthy + expect(wont_hide_2.visible).to be_truthy + + ctgov.cleanup_stray_trials(remaining_ids) + will_hide.reload + + expect(will_hide.visible).to be_falsey + expect(wont_hide_2.visible).to be_truthy + end + end +end diff --git a/spec/factories/parser.rb b/spec/factories/parser.rb new file mode 100644 index 0000000..cabdd70 --- /dev/null +++ b/spec/factories/parser.rb @@ -0,0 +1,6 @@ +FactoryBot.define do + factory :parser do + name { 'clinicaltrials.gov' } + klass { 'Parsers::Ctgov' } + end +end diff --git a/spec/factories/trial.rb b/spec/factories/trial.rb index e8a1e6c..2509c43 100644 --- a/spec/factories/trial.rb +++ b/spec/factories/trial.rb @@ -1,7 +1,9 @@ FactoryBot.define do sequence(:system_id, 10000) { |n| "STUDY#{n}" } + sequence(:nct_id, 10000) { |n| "NCT#{n}" } factory :trial do system_id { generate(:system_id) } + nct_id { generate(:nct_id) } brief_title { Faker::Lorem.sentence } approved { true } visible { true } From f19beea05a06f294580f1c1b5f4bba36bf52e0ca Mon Sep 17 00:00:00 2001 From: Brian Hayden Date: Mon, 8 Jan 2024 14:08:44 -0600 Subject: [PATCH 2/4] Use `update!` so we throw errors if de-publishing a study fails. --- lib/connectors/ctgov.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb index 72e8ce6..65eb3da 100644 --- a/lib/connectors/ctgov.rb +++ b/lib/connectors/ctgov.rb @@ -101,7 +101,7 @@ def stray_trials(nct_ids = nil) def cleanup_stray_trials(nct_ids = nil) nct_ids ||= site_nct_ids - stray_trials(nct_ids).update(visible: false) + stray_trials(nct_ids).update!(visible: false) end def nct_ids_for_location(location, page_token = nil) From 50576614d273e80926402f2fbc1b61c0b8e2447e Mon Sep 17 00:00:00 2001 From: Brian Hayden Date: Tue, 9 Jan 2024 10:01:42 -0600 Subject: [PATCH 3/4] Fix indentation --- lib/connectors/ctgov.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb index 65eb3da..7381bb4 100644 --- a/lib/connectors/ctgov.rb +++ b/lib/connectors/ctgov.rb @@ -127,7 +127,7 @@ def nct_ids_for_location(location, page_token = nil) payload = JSON.parse(response.body || "{}") response_ids = Array(payload.dig("studies")).map do |result| - result.dig("protocolSection").dig("identificationModule").dig("nctId") + result.dig("protocolSection").dig("identificationModule").dig("nctId") end # Add the ids we just received, and ... From e792a2e8505bd5248b1e23bf19cd43b078cddae0 Mon Sep 17 00:00:00 2001 From: Brian Hayden Date: Tue, 9 Jan 2024 12:53:00 -0600 Subject: [PATCH 4/4] Remove ability to inject ID sets via arguments This was introduced solely for testing -- replace with stubbing :site_nct_ids in the spec. --- lib/connectors/ctgov.rb | 10 ++++------ spec/connectors/ctgov_spec.rb | 10 +++++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb index 7381bb4..292ee40 100644 --- a/lib/connectors/ctgov.rb +++ b/lib/connectors/ctgov.rb @@ -94,14 +94,12 @@ def site_nct_ids nct_ids_for_location(SystemInfo.search_term) end - def stray_trials(nct_ids = nil) - nct_ids ||= site_nct_ids - Trial.where(parser_id: @parser_id).where.not(nct_id: nct_ids) + def stray_trials + Trial.where(parser_id: @parser_id).where.not(nct_id: self.site_nct_ids) end - def cleanup_stray_trials(nct_ids = nil) - nct_ids ||= site_nct_ids - stray_trials(nct_ids).update!(visible: false) + def cleanup_stray_trials + stray_trials.update_all(visible: false) end def nct_ids_for_location(location, page_token = nil) diff --git a/spec/connectors/ctgov_spec.rb b/spec/connectors/ctgov_spec.rb index 650d869..dc1e3f7 100644 --- a/spec/connectors/ctgov_spec.rb +++ b/spec/connectors/ctgov_spec.rb @@ -14,10 +14,12 @@ expect(will_hide.visible).to be_truthy expect(wont_hide.first.visible).to be_truthy - strays = ctgov.stray_trials(remaining_ids) + allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids) + + strays = ctgov.stray_trials expect(strays.map { |e| e.nct_id }).to include(will_hide.nct_id) - ctgov.cleanup_stray_trials(remaining_ids) + ctgov.cleanup_stray_trials will_hide.reload expect(will_hide.visible).to be_falsey expect(wont_hide.first.visible).to be_truthy @@ -36,7 +38,9 @@ expect(will_hide.visible).to be_truthy expect(wont_hide_2.visible).to be_truthy - ctgov.cleanup_stray_trials(remaining_ids) + allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids) + + ctgov.cleanup_stray_trials will_hide.reload expect(will_hide.visible).to be_falsey