Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Address expiry of studies from clinicaltrials.gov #157

Merged
merged 4 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 53 additions & 34 deletions lib/connectors/ctgov.rb
Original file line number Diff line number Diff line change
Expand Up @@ -90,55 +90,74 @@ def clear
TrialCondition.delete_all
end

def site_nct_ids
nct_ids_for_location(SystemInfo.search_term)
end

def stray_trials
Trial.where.not(system_id: nct_ids_for_location(@system_info.search_term))
Trial.where(parser_id: @parser_id).where.not(nct_id: self.site_nct_ids)
end

def cleanup_stray_trials
stray_trials.destroy_all
stray_trials.update_all(visible: false)
end

private
def nct_ids_for_location(location, page_token = nil)
csc = 'M Health Fairview Clinics and Surgery Center'
ids = []
q = {
'query.locn' => "SEARCH[Location](AREA[LocationFacility]#{location} AND AREA[LocationStatus]RECRUITING)",
fields: "NCTId",
countTotal: true,
pageSize: 1000,
format: "json"
}

# API only wants a pageToken arg at all if we are actually asking for one.
if !page_token.blank?
q[:pageToken] = page_token
end

def extract_zip
dirname = "#{Rails.root}/tmp/"
unless File.directory?(dirname)
FileUtils.mkdir_p(dirname)
end
response = HTTParty.get(
"https://clinicaltrials.gov/api/v2/studies",
query: q
)
payload = JSON.parse(response.body || "{}")

unless File.directory?("#{dirname}trials/")
FileUtils.mkdir_p("#{dirname}trials/")
end
response_ids = Array(payload.dig("studies")).map do |result|
result.dig("protocolSection").dig("identificationModule").dig("nctId")
end

FileUtils.rm_rf(Dir.glob("#{dirname}trials/*"))
Zip::File.open("#{dirname}search_result.zip") do |file|
file.each do |entry|
file.extract(entry, "#{dirname}trials/#{entry.name}")
end
end
# Add the ids we just received, and ...
ids.push(*response_ids)

# ... recurse if there's another page.
if payload.dig("nextPageToken")
ids.push(*(nct_ids_for_location(location, payload.dig("nextPageToken"))))
end

def nct_ids_for_location(location, start = 1, endd = 1000, ids = [])
response = HTTParty.get(
"https://classic.clinicaltrials.gov/api/query/study_fields",
query: {
expr: "SEARCH[Location](AREA[LocationFacility]#{location})",
fields: "NCTId",
min_rnk: start,
max_rnk: endd,
fmt: "json"
}
)
return ids
end

private

def extract_zip
dirname = "#{Rails.root}/tmp/"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not using File.join and friends here is screaming to my eyeballs, but I don't think it's strictly necessary anymore (i.e. forward slashes work on both *nix and Windows now).

unless File.directory?(dirname)
Copy link
Member

@cdinger cdinger Jan 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FileUtils.mkdir_p doesn't fail if a directory already exists. These checks are probably not needed.

FileUtils.mkdir_p(dirname)
end

response_ids = Array(JSON.parse(response.body || "{}").dig("StudyFieldsResponse").dig("StudyFields")).map do |result|
Array(result.dig("NCTId")).first
unless File.directory?("#{dirname}trials/")
FileUtils.mkdir_p("#{dirname}trials/")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's Dir.mktmpdir that might be appropriate here. It looks kind of like Tempfile but without the built-in cleanup hook I think.

end

if response_ids.empty?
ids
else
nct_ids_for_location(location, endd + 1, endd + 1000, ids + response_ids)
FileUtils.rm_rf(Dir.glob("#{dirname}trials/*"))
Zip::File.open("#{dirname}search_result.zip") do |file|
file.each do |entry|
file.extract(entry, "#{dirname}trials/#{entry.name}")
end
end
end

end
end
8 changes: 8 additions & 0 deletions lib/tasks/ctgov.rake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ namespace :studyfinder do
Trial.import force: true
end

task cleanup_strays: :environment do |t, args|
puts "Cleaning up stray trials"
connector = Connectors::Ctgov.new
trials = connector.cleanup_stray_trials
puts "Have un-published (system_ids): "
puts trials.map{ |e| " #{e.system_id}\n" }
Comment on lines +40 to +42
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

connector.cleanup_stray_trials returns the result of calling update(visible: true on stray trials. Since update will return the record(s) regardless of whether the update actually succeeded, would you need to do any final checking in this rake task to ensure what you are printing actually includes successfully updated records?

There's only a single validation I saw for Trial, so the likelihood of the update failing is probably minimal. And you aren't actually manipulating the returned data, so maybe this doesn't matter much.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Valid point. Updated to use update! so we get an error alert if there's an issue.

end

# ==============================================================================================
# studyfinder:ctgov:reload_all
# Note: Dangerous business here!! This will delete and reload data from every
Expand Down
50 changes: 50 additions & 0 deletions spec/connectors/ctgov_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
require 'rails_helper'
require 'connectors/ctgov'

describe Connectors::Ctgov do
context "cleanup_stray_trials" do
it "hides trials that are no longer actively recruiting at the given location(s)" do
parser = create(:parser)
system_info = create(:system_info, initials: 'TSTU')
ctgov = Connectors::Ctgov.new
will_hide = create(:trial, parser: parser)
wont_hide = create_list(:trial, 5, parser: parser)
remaining_ids = wont_hide.map { |e| e.nct_id }

expect(will_hide.visible).to be_truthy
expect(wont_hide.first.visible).to be_truthy

allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids)

strays = ctgov.stray_trials
expect(strays.map { |e| e.nct_id }).to include(will_hide.nct_id)

ctgov.cleanup_stray_trials
will_hide.reload
expect(will_hide.visible).to be_falsey
expect(wont_hide.first.visible).to be_truthy
end

it "does not hide trials from a different parser" do
parser = create(:parser)
parser2 = create(:parser, name: 'foobar', klass: 'Parsers::Foobar')
system_info = create(:system_info, initials: 'TSTU')
ctgov = Connectors::Ctgov.new
will_hide = create(:trial, parser: parser)
wont_hide = create_list(:trial, 5, parser: parser)
wont_hide_2 = create(:trial, parser: parser2)
remaining_ids = wont_hide.map { |e| e.nct_id }

expect(will_hide.visible).to be_truthy
expect(wont_hide_2.visible).to be_truthy

allow(ctgov).to receive(:site_nct_ids).and_return(remaining_ids)

ctgov.cleanup_stray_trials
will_hide.reload

expect(will_hide.visible).to be_falsey
expect(wont_hide_2.visible).to be_truthy
end
end
end
6 changes: 6 additions & 0 deletions spec/factories/parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FactoryBot.define do
factory :parser do
name { 'clinicaltrials.gov' }
klass { 'Parsers::Ctgov' }
end
end
2 changes: 2 additions & 0 deletions spec/factories/trial.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
FactoryBot.define do
sequence(:system_id, 10000) { |n| "STUDY#{n}" }
sequence(:nct_id, 10000) { |n| "NCT#{n}" }
factory :trial do
system_id { generate(:system_id) }
nct_id { generate(:nct_id) }
brief_title { Faker::Lorem.sentence }
approved { true }
visible { true }
Expand Down
Loading