ahcis-rds · machinehum · Jul 24, 2024 · Jul 17, 2024
diff --git a/README.md b/README.md
@@ -10,9 +10,11 @@ Contact the StudyFinder team at studyfinder@umn.edu if you:
 - Have any questions about StudyFinder, or
 - Want to learn more about updates or enhancements of the tool.
 
-## Upgrade notes for 2.1
+## Upgrade notes for 2.2
+The built-in clinicaltrials.gov connector now uses the clinicaltrials.gov V2 API exclusively. This introduces two breaking changes in the private API of the ctgov connector:
 
-The main page carousel/video feature was an accessibility and usability issue, and has been replaced with a three-wide panel of "featured studies". These can be configured in the admin panel, where the carousel configuration formerly was.
+1. In `Connectors::Ctgov#load(start_date,end_date)` the start and end dates must now be in ISO 8601 format, `YYYY-MM-DD` (the old format was `MM/DD/YYYY`). Any custom tasks that call this method directly should be updated.
+2. `Connectors::Ctgov#load(start_date,end_date)` now calls `Connectors::Ctgov#process` itself, once per page, as it recurses through the V2 API's paged results. Formerly, `load` and `process` had to be called separately, in that order. Remove any direct calls to `process` to avoid redundantly re-processing the last "page" of data from the API.
 
 ## Development
 

diff --git a/app/views/studies/_clinicaltrialsgov_button.html.erb b/app/views/studies/_clinicaltrialsgov_button.html.erb
@@ -1,5 +1,5 @@
 <% if Trial.is_nct_number?(study.nct_id) %>
-  <a class="btn btn-school btn-more-info" href="https://www.clinicaltrials.gov/ct2/show/study/<%= study.nct_id%>" onclick="track('send', 'event', 'ctgov', 'click', {'nct_id':'<%= study.nct_id %>'});" target="_blank">
+  <a class="btn btn-school btn-more-info" href="https://www.clinicaltrials.gov/study/<%= study.nct_id%>" onclick="track('send', 'event', 'ctgov', 'click', {'nct_id':'<%= study.nct_id %>'});" target="_blank">
     <i class="fa-solid fa-info-circle"></i>
     See this study on ClinicalTrials.gov
   </a>

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,4 +1,3 @@
-version: '3'
 services:
   elasticsearch:
     image: elasticsearch:8.10.2

diff --git a/lib/connectors/ctgov.rb b/lib/connectors/ctgov.rb
@@ -5,77 +5,98 @@ class Ctgov
 
     def initialize
       @system_info = SystemInfo.current
-      @parser_id = Parser.find_by({ klass: 'Parsers::Ctgov'}).id
 
       if @system_info.nil?
         raise "There is no system info associated. Please run the seeds file, or add the info in the system administration section."
       end
-    end
-
-    def load(start_date=nil, end_date=nil)
-      start_load_time = Time.now
 
-      url = "https://clinicaltrials.gov/ct2/results/download_studies?locn=#{ERB::Util.url_encode(@system_info.search_term)}"
+      @parser_id = Parser.find_by({ klass: 'Parsers::Ctgov'}).id # looked up only after the system-info check has passed
+      @location = @system_info.search_term # facility search term used in the location filter
+      @page_token = nil # blank means "no pageToken param"; set from nextPageToken while paging
+      @payload = nil # parsed JSON of the most recently fetched page
+      @start_date = 'MIN' # default lower bound of the LastUpdatePostDate RANGE filter
+      @end_date = 'MAX' # default upper bound of the LastUpdatePostDate RANGE filter
+      @start_load_time = nil # set on the first load call; used for total elapsed time
+      @total_count = nil # filled from the payload's totalCount the first time one is seen
+      @count = 0 # running number of studies processed across all pages
+    end
 
-      if !start_date.nil? and !end_date.nil?
-        puts "Loading clinicaltrials.gov results for #{@system_info.search_term} ... from #{start_date} to #{end_date}"
-        url = url + "&lup_s=#{ERB::Util.url_encode(start_date)}&lup_e=#{ERB::Util.url_encode(end_date)}"
-      else
-        puts "Loading all clinicaltrials.gov results for #{@system_info.search_term} ..."
+    def study_filters # Builds the query hash sent to /api/v2/studies: location + RECRUITING status, last-update date range, paging options.
+      q = {
+          'query.locn' => "AREA[LocationFacility]#{@location} AND AREA[LocationStatus]RECRUITING",
+          'query.term' => "AREA[LastUpdatePostDate]RANGE[#{@start_date},#{@end_date}]",
+          countTotal: true,
+          pageSize: 100,
+          format: "json"
+        }
+      # Leave pageToken out entirely while @page_token is blank: the API only wants the arg when we are actually asking for a page.
+      if !@page_token.blank?
+        q[:pageToken] = @page_token
       end
 
-      puts "Search URL: #{url}"
-      # @zipfile = Tempfile.new('file')
-      # @zipfile.binmode
+      return q
+    end
 
-      dirname = "#{Rails.root}/tmp/"
-      unless File.directory?(dirname)
-        FileUtils.mkdir_p(dirname)
-      end
+    def studies_page # Fetches one page of studies (per study_filters) and stores the parsed JSON in @payload.
+      response = HTTParty.get(
+        "https://clinicaltrials.gov/api/v2/studies",
+        query: self.study_filters
+      )
+      @payload = JSON.parse(response.body || "{}") # a nil body is parsed as an empty JSON object
+      @total_count ||= @payload.dig('totalCount') # keep the first non-nil total; later pages do not overwrite it
+      puts "Retrieved page (#{@page_token})"
+    end
 
-      FileUtils.rm_rf("#{dirname}search_result.zip")
-      File.open("#{dirname}search_result.zip", "w+") do |f|
-        f.write(HTTParty.get(url).body)
-      end
-      # @zipfile.write(HTTParty.get(url).body)
-      # @zipfile.close
+    def load(start_date="MIN", end_date="MAX") # Fetches and processes every page of studies last updated in start_date..end_date (YYYY-MM-DD; defaults MIN/MAX).
+      puts "Adding/Updating trials in the database.  If it is a full reload it's going to be awhile...  Maybe get some coffee? :)"
+      @start_date = start_date
+      @end_date = end_date
+      @start_load_time ||= Time.now # keep the first call's start time across the recursive per-page calls
 
-      puts "Extracting trials from zip file"
-      extract()
-      end_load_time = Time.now
+      self.studies_page
 
-      puts "Time elapsed #{(end_load_time - start_load_time)} seconds"
-    end
+      # Process the studies we just received, and ...
+      self.process
+      # ... recurse if there's another page (one extra stack frame per page).
 
-    def extract
-      start_load_time = Time.now
-      extract_zip()
-      end_load_time = Time.now
+      if @payload.dig("nextPageToken")
+        @page_token = @payload.dig("nextPageToken")
+      else
+        @page_token = nil
+      end
 
-      puts "Zip time elapsed: #{(end_load_time - start_load_time)}"
-      return true
+      if @page_token.blank?
+        puts "clinicaltrials.gov load COMPLETE."
+      else
+        puts "Now we'll load page #{@page_token}"
+        @payload = nil
+        self.load(@start_date,@end_date)
+      end
     end
 
     def process
-      start_load_time = Time.now
-      count = 0
-      puts "Adding/Updating trials in the database.  If it is a full reload it's going to be awhile...  Maybe get some coffee? :)"
-
-      Dir.glob("#{Rails.root}/tmp/trials/*.xml") do |file|
-        p = Parsers::Ctgov.new( file.gsub("#{Rails.root}/tmp/trials/", "").gsub(".xml", ""), @parser_id)
-        p.load(file)
+      page_start_load_time = Time.now # Hands each study in the current @payload page to Parsers::Ctgov#process, then logs an Updater row.
+      page_count = 0
+      puts "Processing page (#{@page_token})"
+      # A payload without a 'studies' key (e.g. the "{}" fallback used for a nil response body) counts as an empty page.
+      (@payload.dig('studies') || []).each do |study|
+        @id = study.dig('protocolSection', 'identificationModule', 'nctId')
+        p = Parsers::Ctgov.new(@id, @parser_id, study)
+        puts "Processing: #{@id} (#{@count + 1} of #{@total_count})"
         p.process
-        count = count + 1
+        page_count = page_count + 1
+        @count = @count + 1
       end
-      end_load_time = Time.now
+      page_end_load_time = Time.now
 
-      puts "Logging update to updaters table. Processed #{count} records."
+      puts "Logging update to updaters table."
       Updater.create({
         parser_id: @parser_id,
-        num_updated: count
+        num_updated: page_count
       })
 
-      puts "Process time elapsed: #{(end_load_time - start_load_time)} seconds"
+      puts "Page time elapsed: #{(page_end_load_time - page_start_load_time)} seconds for #{page_count} records."
+      puts "Total process elapsed: #{(page_end_load_time - @start_load_time)} seconds for #{@count} records."
       return true
     end
 
@@ -86,8 +107,9 @@ def clear
       TrialLocation.delete_all
       TrialKeyword.delete_all
       Location.delete_all
-      Trial.delete_all
+      TrialSubgroup.delete_all
       TrialCondition.delete_all
+      Trial.delete_all
     end
 
     def site_nct_ids
@@ -103,7 +125,6 @@ def cleanup_stray_trials
     end
 
     def nct_ids_for_location(location, page_token = nil)
-      csc = 'M Health Fairview Clinics and Surgery Center'
       ids = []
       q = {
           'query.locn' => "SEARCH[Location](AREA[LocationFacility]#{location} AND AREA[LocationStatus]RECRUITING)",