ahcis-rds · machinehum · Jan 9, 2024 · Dec 19, 2023 · Jan 8, 2024 · Jan 9, 2024
diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb
@@ -90,55 +90,79 @@ def clear
       TrialCondition.delete_all
     end
 
+    # NCT ids that clinicaltrials.gov reports as recruiting for SystemInfo.search_term.
+    def site_nct_ids
+      nct_ids_for_location(SystemInfo.search_term)
+    end
+
+    # This parser's trials whose nct_id is not among site_nct_ids.
     def stray_trials
-      Trial.where.not(system_id: nct_ids_for_location(@system_info.search_term))
+      Trial.where(parser_id: @parser_id).where.not(nct_id: site_nct_ids)
     end
 
+    # Hides stray trials (visible: false) instead of deleting them. Returns
+    # whatever update_all returns (a row count), not the trial records.
     def cleanup_stray_trials
-      stray_trials.destroy_all
+      stray_trials.update_all(visible: false)
     end
 
-    private
+    # Returns the NCT ids of every study clinicaltrials.gov lists as RECRUITING
+    # at +location+, following nextPageToken pagination through the last page.
+    # +page_token+ is supplied by the recursive call when fetching later pages.
+    # Raises if the API does not answer with a success status.
+    def nct_ids_for_location(location, page_token = nil)
+      q = {
+        'query.locn' => "SEARCH[Location](AREA[LocationFacility]#{location} AND AREA[LocationStatus]RECRUITING)",
+        fields: "NCTId",
+        countTotal: true,
+        pageSize: 1000,
+        format: "json"
+      }
+
+      # API only wants a pageToken arg at all if we are actually asking for one.
+      q[:pageToken] = page_token unless page_token.blank?
 
-      def extract_zip
-        dirname = "#{Rails.root}/tmp/"
-        unless File.directory?(dirname)
-          FileUtils.mkdir_p(dirname)
-        end
+      response = HTTParty.get("https://clinicaltrials.gov/api/v2/studies", query: q)
+      # A failed lookup must not look like "no studies here": callers hide every
+      # local trial missing from this list, so abort instead of returning [].
+      raise "clinicaltrials.gov lookup failed (HTTP #{response.code})" unless response.success?
+      payload = JSON.parse(response.body || "{}")
 
-        unless File.directory?("#{dirname}trials/")
-          FileUtils.mkdir_p("#{dirname}trials/")
-        end
+      # Multi-key dig yields nil (rather than raising) for a study that lacks a
+      # section; compact then keeps those nils out of the returned id list.
+      ids = Array(payload["studies"]).map do |study|
+        study.dig("protocolSection", "identificationModule", "nctId")
+      end.compact
 
-        FileUtils.rm_rf(Dir.glob("#{dirname}trials/*"))
-        Zip::File.open("#{dirname}search_result.zip") do |file|
-          file.each do |entry|
-            file.extract(entry, "#{dirname}trials/#{entry.name}")
-          end
-        end
+      # Recurse for the remaining pages, if any, and append their ids.
+      next_token = payload["nextPageToken"]
+      if next_token
+        ids.concat(nct_ids_for_location(location, next_token))
       end
 
-    def nct_ids_for_location(location, start = 1, endd = 1000, ids = [])
-      response = HTTParty.get(
-        "https://classic.clinicaltrials.gov/api/query/study_fields",
-        query: {
-          expr: "SEARCH[Location](AREA[LocationFacility]#{location})",
-          fields: "NCTId",
-          min_rnk: start,
-          max_rnk: endd,
-          fmt: "json"
-        }
-      )
+      ids
+    end
+
+    private
+
+    # Ensures tmp/trials/ exists under Rails.root, clears its contents, and
+    # extracts every entry of tmp/search_result.zip into it.
+    def extract_zip
+      dirname = "#{Rails.root}/tmp/"
+      unless File.directory?(dirname)
+        FileUtils.mkdir_p(dirname)
+      end
 
-      response_ids = Array(JSON.parse(response.body || "{}").dig("StudyFieldsResponse").dig("StudyFields")).map do |result|
-        Array(result.dig("NCTId")).first
+      unless File.directory?("#{dirname}trials/")
+        FileUtils.mkdir_p("#{dirname}trials/")
       end
 
-      if response_ids.empty?
-        ids
-      else
-        nct_ids_for_location(location, endd + 1, endd + 1000, ids + response_ids)
+      FileUtils.rm_rf(Dir.glob("#{dirname}trials/*"))
+      Zip::File.open("#{dirname}search_result.zip") do |file|
+        file.each do |entry|
+          file.extract(entry, "#{dirname}trials/#{entry.name}")
+        end
       end
     end
   end
 end
diff --git a/lib/tasks/ctgov.rake b/lib/tasks/ctgov.rake
@@ -34,6 +34,18 @@ namespace :studyfinder do
       Trial.import force: true
     end
 
+    # Hides trials that no longer come back from the site's recruiting search
+    # and prints the system_id of each one.
+    task cleanup_strays: :environment do
+      puts "Cleaning up stray trials"
+      connector = Connectors::Ctgov.new
+      # Load the strays before hiding them: cleanup_stray_trials ends in
+      # update_all, which returns a row count rather than records to report on.
+      trials = connector.stray_trials.to_a
+      connector.cleanup_stray_trials
+      puts "Have un-published (system_ids): "
+      puts trials.map { |e| " #{e.system_id}\n" }
+    end
+
     # ==============================================================================================
     # studyfinder:ctgov:reload_all
     # Note: Dangerous business here!!  This will delete and reload data from every

diff --git a/spec/connectors/ctgov_spec.rb b/spec/connectors/ctgov_spec.rb
@@ -0,0 +1,49 @@
+require 'rails_helper'
+require 'connectors/ctgov'
+
+describe Connectors::Ctgov do
+  context "cleanup_stray_trials" do
+    it "hides trials that are no longer actively recruiting at the given location(s)" do
+      parser = create(:parser)
+      create(:system_info, initials: 'TSTU')
+      ctgov = Connectors::Ctgov.new
+      will_hide = create(:trial, parser: parser)
+      wont_hide = create_list(:trial, 5, parser: parser)
+
+      expect(will_hide.visible).to be_truthy
+      expect(wont_hide.first.visible).to be_truthy
+
+      # Stand in for the clinicaltrials.gov lookup: only wont_hide is still listed.
+      allow(ctgov).to receive(:site_nct_ids).and_return(wont_hide.map(&:nct_id))
+
+      expect(ctgov.stray_trials.map(&:nct_id)).to include(will_hide.nct_id)
+
+      ctgov.cleanup_stray_trials
+
+      # cleanup writes via update_all, so in-memory records are stale until reloaded.
+      expect(will_hide.reload.visible).to be_falsey
+      expect(wont_hide.map { |trial| trial.reload.visible }).to all(be_truthy)
+    end
+
+    it "does not hide trials from a different parser" do
+      parser = create(:parser)
+      parser2 = create(:parser, name: 'foobar', klass: 'Parsers::Foobar')
+      create(:system_info, initials: 'TSTU')
+      ctgov = Connectors::Ctgov.new
+      will_hide = create(:trial, parser: parser)
+      wont_hide = create_list(:trial, 5, parser: parser)
+      wont_hide_2 = create(:trial, parser: parser2)
+
+      expect(will_hide.visible).to be_truthy
+      expect(wont_hide_2.visible).to be_truthy
+
+      allow(ctgov).to receive(:site_nct_ids).and_return(wont_hide.map(&:nct_id))
+
+      ctgov.cleanup_stray_trials
+
+      # Reload both: without it the other-parser assertion could never fail.
+      expect(will_hide.reload.visible).to be_falsey
+      expect(wont_hide_2.reload.visible).to be_truthy
+    end
+  end
+end
diff --git a/spec/factories/parser.rb b/spec/factories/parser.rb
@@ -0,0 +1,7 @@
+# Parser factory; defaults describe the clinicaltrials.gov parser (Parsers::Ctgov).
+FactoryBot.define do
+  factory :parser do
+    name { 'clinicaltrials.gov' }
+    klass { 'Parsers::Ctgov' }
+  end
+end
diff --git a/spec/factories/trial.rb b/spec/factories/trial.rb
@@ -1,7 +1,9 @@
 FactoryBot.define do
   sequence(:system_id, 10000) { |n| "STUDY#{n}" }
+  sequence(:nct_id, 10000) { |n| "NCT#{n}" }
   factory :trial do
     system_id { generate(:system_id) }
+    nct_id { generate(:nct_id) }
     brief_title { Faker::Lorem.sentence }
     approved { true }
     visible { true }