Skip to content

Commit

Permalink
Add workers to download and import ONSPD data
Browse files Browse the repository at this point in the history
  • Loading branch information
KludgeKML committed Jun 5, 2023
1 parent 9b68636 commit b2e1d28
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 1 deletion.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ gem "httparty"
gem "pact", require: false
gem "pact_broker-client"
gem "pg"
gem "rubyzip"
gem "sentry-sidekiq"
gem "sidekiq-scheduler"
gem "sidekiq-unique-jobs"
Expand Down
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ GEM
concurrent-ruby (~> 1.0)
jmespath (1.6.2)
json (2.6.3)
kgio (2.11.4)
link_header (0.0.8)
listen (3.8.0)
rb-fsevent (~> 0.10, >= 0.10.3)
Expand Down Expand Up @@ -448,6 +447,7 @@ DEPENDENCIES
rails (= 7.0.5)
rspec-rails
rubocop-govuk
rubyzip
sentry-sidekiq
sidekiq-scheduler
sidekiq-unique-jobs
Expand Down
12 changes: 12 additions & 0 deletions app/workers/ons_base_worker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
require "aws-sdk-s3"

# Common base class for the ONS postcode workers.
#
# Supplies the Sidekiq configuration (queue name and job-lock options), the
# name of the S3 bucket the workers read from / write to, and a memoised S3
# client.
class OnsBaseWorker
  include Sidekiq::Worker

  # NOTE(review): `lock` / `lock_timeout` look like sidekiq-unique-jobs
  # options (that gem is in the Gemfile) - confirm their exact semantics there.
  sidekiq_options(queue: :queue_ons, lock: :until_executed, lock_timeout: nil)

  # Bucket name is derived from the GOVUK_ENVIRONMENT_NAME environment
  # variable, read once when this class is loaded.
  BUCKET_NAME = "govuk-#{ENV['GOVUK_ENVIRONMENT_NAME']}-locations-api-import-csvs".freeze

  # Returns the S3 client for this worker instance, building it on first use
  # and reusing the same object on every later call.
  def s3_client
    return @s3_client if @s3_client

    @s3_client = Aws::S3::Client.new(region: "eu-west-1")
  end
end
42 changes: 42 additions & 0 deletions app/workers/ons_download_worker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require "open-uri"
require "zip"

# Downloads an ONSPD zip archive from the given URL and uploads each of the
# per-area CSV files it contains (those under Data/multi_csv/) to the S3
# bucket named by OnsBaseWorker::BUCKET_NAME.
class OnsDownloadWorker < OnsBaseWorker
  # Example URL: https://www.arcgis.com/sharing/rest/content/items/a2f8c9c5778a452bbf640d98c166657c/data
  # retrieved by visiting https://geoportal.statistics.gov.uk/search?collection=Dataset&sort=-created&tags=all(PRD_ONSPD)
  # clicking on the first search result and then copying the link from the download button.

  # Matches archive entries named Data/multi_csv/ONSPD_<a>_UK_<b>.csv.
  # Capture 1 (<a>) and capture 2 (<b>) are used to build the S3 key.
  # The dot before "csv" is escaped so only a literal ".csv" suffix matches.
  DATAFILE_REGEX = %r{\AData/multi_csv/ONSPD_(.*)_UK_(.*)\.csv\z}

  # Sidekiq entry point.
  #
  # @param url [String] location of the ONSPD zip archive to download
  # @return [void]
  #
  # Failures for an individual archive entry are reported with `puts` and do
  # not stop the remaining entries from being processed. Failures while
  # downloading or opening the archive propagate to the caller.
  def perform(url)
    temp_zip_file = Tempfile.new("tmp/ONSPD.zip")

    # 1. Download File
    download(url, temp_zip_file.path)

    # 2. Unzip File/Data/multi_csv, and post to S3 bucket
    Zip::File.open(temp_zip_file.path) do |zip_file|
      zip_file.each do |entry|
        file_details = entry.name.match(DATAFILE_REGEX)
        next unless file_details

        upload_entry(entry, "ons/#{file_details[1]}/#{file_details[2]}.csv")
      end
    end
  ensure
    # Close and delete the downloaded archive whether or not the upload
    # succeeded, so temp files do not accumulate on the worker.
    temp_zip_file&.close!
  end

private

  # Streams the remote resource at +url+ into the local file at +path+.
  # The block form of URI#open closes the remote stream once copying is done.
  def download(url, path)
    URI.parse(url).open { |remote| IO.copy_stream(remote, path) }
  end

  # Reads one zip entry fully into memory and stores it in S3 under
  # +s3_key_name+. Any StandardError is reported and swallowed so one bad
  # entry does not abort the whole archive.
  def upload_entry(entry, s3_key_name)
    # Block form ensures the entry's input stream is closed after reading.
    content = entry.get_input_stream(&:read)

    _response = s3_client.put_object(
      bucket: BUCKET_NAME,
      key: s3_key_name,
      body: content,
    )
    # TODO: check response.etag (if false, upload failed somehow?)
    # TODO: Kick off OnsImportWorker for the file
    # OnsImportWorker.perform_async(s3_key_name)
    puts "Added #{entry.name} to S3 bucket as #{s3_key_name}"
  rescue StandardError => e
    puts "Error extracting and uploading object #{e.message}"
  end
end
31 changes: 31 additions & 0 deletions app/workers/ons_import_worker.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CSV is used below and nothing else in this file loads it.
require "csv"

# Imports one ONSPD CSV file from S3 into the Postcode table.
#
# The object is fetched from OnsBaseWorker::BUCKET_NAME into a temp file and
# read row by row; a Postcode record is created for every row whose normalised
# postcode is not already present.
class OnsImportWorker < OnsBaseWorker
  # Sidekiq entry point.
  #
  # @param s3_key_name [String] key of the CSV object inside the bucket
  # @return [void]
  #
  # Any StandardError raised while downloading or importing is reported with
  # `puts` and swallowed, so the job itself does not raise.
  def perform(s3_key_name)
    temp_csv_file = Tempfile.new("tmp/ONSPD.csv")

    # Ask the S3 client to write the object body straight to the temp file.
    s3_client.get_object(
      response_target: temp_csv_file.path,
      bucket: BUCKET_NAME,
      key: s3_key_name,
    )

    CSV.foreach(temp_csv_file.path, headers: true) do |row|
      postcode = PostcodeHelper.normalise(row["pcds"])
      # Existing postcodes are left untouched; exists? avoids a full COUNT.
      next if Postcode.exists?(postcode:)

      results = [
        {
          "ONS" => {
            "AVG_LNG" => row["long"],
            "AVG_LAT" => row["lat"],
            # NOTE(review): "0" maps to "S", anything else to "L" -
            # presumably small/large user; confirm against the ONSPD guide.
            "TYPE" => row["usertype"] == "0" ? "S" : "L",
            "DOTERM" => row["doterm"],
          },
        },
      ]

      Postcode.create(postcode:, results:)
    end
  rescue StandardError => e
    puts "Error getting object: #{e.message}"
  ensure
    # Close and delete the temp CSV whether or not the import succeeded.
    temp_csv_file&.close!
  end
end

0 comments on commit b2e1d28

Please sign in to comment.