diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb index 94658d9..292ee40 100644 --- a/lib/connectors/ctgov.rb +++ b/lib/connectors/ctgov.rb @@ -90,55 +90,83 @@ def clear TrialCondition.delete_all end + # NCT ids that ClinicalTrials.gov reports as recruiting for the site's search term. + # NOTE(review): the replaced code read @system_info.search_term; confirm SystemInfo.search_term exists. + def site_nct_ids + nct_ids_for_location(SystemInfo.search_term) + end + def stray_trials - Trial.where.not(system_id: nct_ids_for_location(@system_info.search_term)) + # Only consider trials that belong to this parser (@parser_id). + Trial.where(parser_id: @parser_id).where.not(nct_id: site_nct_ids) end def cleanup_stray_trials - stray_trials.destroy_all + # Hide rather than delete; update_all returns the number of rows changed, not the records. + stray_trials.update_all(visible: false) end - private + # Collects the NCT ids of studies recruiting at +location+, following nextPageToken + # until the API reports no further page. Raises when a request fails, because an + # empty id list would make every trial look stray. + def nct_ids_for_location(location, page_token = nil) + ids = [] + q = { + 'query.locn' => "SEARCH[Location](AREA[LocationFacility]#{location} AND AREA[LocationStatus]RECRUITING)", + fields: "NCTId", + countTotal: true, + pageSize: 1000, + format: "json" + } + + # API only wants a pageToken arg at all if we are actually asking for one. + if page_token.present? + q[:pageToken] = page_token + end - def extract_zip - dirname = "#{Rails.root}/tmp/" - unless File.directory?(dirname) - FileUtils.mkdir_p(dirname) - end + response = HTTParty.get( + "https://clinicaltrials.gov/api/v2/studies", + query: q + ) + unless response.success? + raise "ClinicalTrials.gov request failed (HTTP #{response.code})" + end + payload = JSON.parse(response.body || "{}") - unless File.directory?("#{dirname}trials/") - FileUtils.mkdir_p("#{dirname}trials/") - end + response_ids = Array(payload.dig("studies")).map do |result| + result.dig("protocolSection", "identificationModule", "nctId") + end - FileUtils.rm_rf(Dir.glob("#{dirname}trials/*")) - Zip::File.open("#{dirname}search_result.zip") do |file| - file.each do |entry| - file.extract(entry, "#{dirname}trials/#{entry.name}") - end - end + # Add the ids we just received, and ... + ids.push(*response_ids.compact) + + # ... recurse if there's another page. 
+ if payload.dig("nextPageToken") + ids.push(*(nct_ids_for_location(location, payload.dig("nextPageToken")))) end - def nct_ids_for_location(location, start = 1, endd = 1000, ids = []) - response = HTTParty.get( - "https://classic.clinicaltrials.gov/api/query/study_fields", - query: { - expr: "SEARCH[Location](AREA[LocationFacility]#{location})", - fields: "NCTId", - min_rnk: start, - max_rnk: endd, - fmt: "json" - } - ) + ids + end + + private + + def extract_zip + dirname = "#{Rails.root}/tmp/" + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end - response_ids = Array(JSON.parse(response.body || "{}").dig("StudyFieldsResponse").dig("StudyFields")).map do |result| - Array(result.dig("NCTId")).first + unless File.directory?("#{dirname}trials/") + FileUtils.mkdir_p("#{dirname}trials/") end - if response_ids.empty? - ids - else - nct_ids_for_location(location, endd + 1, endd + 1000, ids + response_ids) + FileUtils.rm_rf(Dir.glob("#{dirname}trials/*")) + Zip::File.open("#{dirname}search_result.zip") do |file| + file.each do |entry| + file.extract(entry, "#{dirname}trials/#{entry.name}") + end end end + end end diff --git a/lib/tasks/ctgov.rake b/lib/tasks/ctgov.rake index b15611d..a180b26 100644 --- a/lib/tasks/ctgov.rake +++ b/lib/tasks/ctgov.rake @@ -34,6 +34,16 @@ namespace :studyfinder do Trial.import force: true end + task cleanup_strays: :environment do |t, args| + puts "Cleaning up stray trials" + connector = Connectors::Ctgov.new + # cleanup_stray_trials returns a row count (update_all), so collect the ids before hiding. + system_ids = connector.stray_trials.pluck(:system_id) + connector.cleanup_stray_trials + puts "Have un-published (system_ids): " + puts system_ids.map { |id| " #{id}\n" } + end + # ============================================================================================== # studyfinder:ctgov:reload_all # Note: Dangerous business here!! 
This will delete and reload data from every diff --git a/spec/connectors/ctgov_spec.rb b/spec/connectors/ctgov_spec.rb new file mode 100644 index 0000000..dc1e3f7 --- /dev/null +++ b/spec/connectors/ctgov_spec.rb @@ -0,0 +1,52 @@ +require 'rails_helper' +require 'connectors/ctgov' + +describe Connectors::Ctgov do + context "cleanup_stray_trials" do + it "hides trials that are no longer actively recruiting at the given location(s)" do + parser = create(:parser) + create(:system_info, initials: 'TSTU') + ctgov = Connectors::Ctgov.new + will_hide = create(:trial, parser: parser) + wont_hide = create_list(:trial, 5, parser: parser) + remaining_ids = wont_hide.map(&:nct_id) + + expect(will_hide.visible).to be_truthy + expect(wont_hide.first.visible).to be_truthy + + allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids) + + strays = ctgov.stray_trials + expect(strays.map(&:nct_id)).to include(will_hide.nct_id) + + ctgov.cleanup_stray_trials + # update_all writes straight to the database, so reload records before asserting on them. + will_hide.reload + expect(will_hide.visible).to be_falsey + expect(wont_hide.map { |e| e.reload.visible }).to all(be_truthy) + end + + it "does not hide trials from a different parser" do + parser = create(:parser) + parser2 = create(:parser, name: 'foobar', klass: 'Parsers::Foobar') + create(:system_info, initials: 'TSTU') + ctgov = Connectors::Ctgov.new + will_hide = create(:trial, parser: parser) + wont_hide = create_list(:trial, 5, parser: parser) + wont_hide_2 = create(:trial, parser: parser2) + remaining_ids = wont_hide.map(&:nct_id) + + expect(will_hide.visible).to be_truthy + expect(wont_hide_2.visible).to be_truthy + + allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids) + + ctgov.cleanup_stray_trials + # Reload both records; without it the final expectations only see stale in-memory state. + will_hide.reload + + expect(will_hide.visible).to be_falsey + expect(wont_hide_2.reload.visible).to be_truthy + end + end +end diff --git a/spec/factories/parser.rb b/spec/factories/parser.rb new file mode 100644 index 0000000..cabdd70 --- /dev/null +++ b/spec/factories/parser.rb 
@@ -0,0 +1,6 @@ +FactoryBot.define do + factory :parser do + name { 'clinicaltrials.gov' } + klass { 'Parsers::Ctgov' } + end +end diff --git a/spec/factories/trial.rb b/spec/factories/trial.rb index e8a1e6c..2509c43 100644 --- a/spec/factories/trial.rb +++ b/spec/factories/trial.rb @@ -1,7 +1,9 @@ FactoryBot.define do sequence(:system_id, 10000) { |n| "STUDY#{n}" } + sequence(:nct_id, 10000) { |n| "NCT#{n}" } factory :trial do system_id { generate(:system_id) } + nct_id { generate(:nct_id) } brief_title { Faker::Lorem.sentence } approved { true } visible { true }