Skip to content

Commit

Permalink
HYC-1988 - Handle soffice pptx (#1132)
Browse files Browse the repository at this point in the history
* Attempt to kill soffice if it times out

* Add custom retry behavior for create derivs job, have it skill SofficeTimeoutErrors immediately

* Use redlock to prevent multiple soffices from running at once. Remove retry handler since it wasn't getting called

* Explicitly add redlock dependency. Require redlock

* Add tests for encode method to verify timeout and kill works. Reduce the number of retries to 3, to avoid bogging the system down

* Use pgroup, capture more info and reproduce more of the behavior from original timeout method

* Wait for lock to become available rather than giving up immediately. Move lock closer to execution

* Reduce the amount of new code by using the existing execute functionality. Fix locking retries
  • Loading branch information
bbpennel authored Nov 19, 2024
1 parent f6b4909 commit 41282a8
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 4 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ gem 'rails', '~> 6.0'
gem 'rdf', git: 'https://github.com/ruby-rdf/rdf.git', branch: '3.2.11-patch'
# Use Redis adapter to run Action Cable in production
gem 'redis', '~> 4.5.0'
gem 'redlock' # version determined by hyrax
gem 'riiif', '~> 2.5.0'
gem 'roo', '~>2.9.0'
gem 'rsolr', '~> 2.5.0'
Expand Down
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,7 @@ DEPENDENCIES
rails (~> 6.0)
rdf!
redis (~> 4.5.0)
redlock
riiif (~> 2.5.0)
roo (~> 2.9.0)
rsolr (~> 2.5.0)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,56 @@
# frozen_string_literal: true
# [hyc-override] https://github.com/samvera/hydra-derivatives/blob/v3.8.0/lib/hydra/derivatives/processors/document.rb
require 'redlock'

class SofficeTimeoutError < Hydra::Derivatives::TimeoutError; end

Hydra::Derivatives::Processors::Document.class_eval do
# TODO: soffice can only run one command at a time. Right now we manage this by only running one
# background job at a time; however, if we want to up concurrency we'll need to deal with this
# [hyc-override] Use Redlock to manage soffice process lock
LOCK_KEY = 'soffice:document_conversion'
LOCK_TIMEOUT = 6 * 60 * 1000 # Lock timeout should be longer than the job timeout
JOB_TIMEOUT_SECONDS = 5 * 60
LOCK_MANAGER = Redlock::Client.new([Redis.current])

# [hyc-override] Adding in a graceful termination before hard kill, reap the process after kill
def self.execute_with_timeout(timeout, command, context)
Timeout.timeout(timeout) do
execute_without_timeout(command, context)
end
rescue Timeout::Error
pid = context[:pid]
if pid
Rails.logger.warn("Terminating soffice process #{pid} after #{timeout} seconds")
Process.kill('TERM', pid) # Attempt a graceful termination
sleep 5 # Give it a few seconds to exit
if system("ps -p #{pid}")
Rails.logger.warn("Killing soffice process #{pid} after graceful termination failed")
Process.kill('KILL', pid)
end
# Harvest the defunct process so it doesn't linger forever
Process.wait(pid)
end
raise SofficeTimeoutError, "Unable to execute command \"#{command}\"\nThe command took longer than #{timeout} seconds to execute"
end

def self.encode(path, format, outdir, timeout = JOB_TIMEOUT_SECONDS)
# [hyc-override] Use Redlock to manage soffice process lock, since only one soffice process can run at a time
begin
LOCK_MANAGER.lock(LOCK_KEY, LOCK_TIMEOUT) do |locked|
if locked
Rails.logger.debug("Acquired lock for document conversion of #{path}")
command = "#{Hydra::Derivatives.libreoffice_path} --invisible --headless --convert-to #{format} --outdir #{outdir} #{Shellwords.escape(path)}"
# [hyc-override] Use execute_with_timeout directly
execute_with_timeout(timeout, command, {})
else
sleep(0.5)
raise Redlock::LockError, "Failed to acquire lock for document conversion of #{path}"
end
end
Rails.logger.debug("Released soffice lock for #{path}")
rescue Redlock::LockError
retry
end
end

# Converts the document to the format specified in the directives hash.
# TODO: file_suffix and options are passed from ShellBasedProcessor.process but are not needed.
Expand Down
2 changes: 1 addition & 1 deletion config/initializers/hyrax.rb
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@
config.fits_path = ENV['FITS_LOCATION']

# Path to the file derivatives creation tool
# config.libreoffice_path = "soffice"
config.libreoffice_path = ENV['LIBREOFFICE_PATH'] || 'soffice'

# Option to enable/disable full text extraction from PDFs
# Default is true, set to false to disable full text extraction
Expand Down
2 changes: 1 addition & 1 deletion config/sidekiq.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
# In production this is overridden by a template in the vagrant-rails project
:concurrency: 6
:max_retries: 5
:max_retries: 3
:queues:
- default
- derivatives
Expand Down
80 changes: 80 additions & 0 deletions spec/lib/hydra/derivatives/processors/document_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

RSpec.describe Hydra::Derivatives::Processors::Document do
subject { described_class.new(source_path, directives) }
PID = 991234

let(:source_path) { File.join(fixture_path, 'test.doc') }
let(:output_service) { Hyrax::PersistDerivatives }
Expand All @@ -29,4 +30,83 @@
end
end
end

describe '.encode' do
let(:path) { '/path/to/document.pptx' }
let(:format) { 'pdf' }
let(:outdir) { '/output/dir' }
let(:timeout) { 5 }

before do
# Mock the Hydra::Derivatives.libreoffice_path
allow(Hydra::Derivatives).to receive(:libreoffice_path).and_return('/fake/path/to/soffice')
end

context 'when the process completes successfully' do
it 'runs the command and completes without timeout' do
# Mock Process.spawn and Process.wait to simulate successful execution
allow(Hydra::Derivatives::Processors::Document).to receive(:execute_without_timeout).and_return(true)

described_class.encode(path, format, outdir, timeout)

# Verify that the process was executed
expect(Hydra::Derivatives::Processors::Document).to have_received(:execute_without_timeout)
end
end

context 'when the process times out' do
it 'kills the process after a timeout' do
# Mock Process.kill to simulate killing the process
allow(Process).to receive(:kill)
allow(Process).to receive(:wait).with(PID)
allow(Hydra::Derivatives::Processors::Document).to receive(:execute_without_timeout).and_wrap_original do |original_method, command, context|
# Simulate setting the PID in the context hash
context[:pid] = PID
# Raise the Timeout::Error
sleep 2
end
allow(Hydra::Derivatives::Processors::Document).to receive(:system).with("ps -p #{PID}").and_return(true)

expect do
described_class.encode(path, format, outdir, 0.5)
end.to raise_error(SofficeTimeoutError, /Unable to execute command.*/)

# Verify that the process was spawned
expect(Process).to have_received(:kill).with('TERM', PID) # Attempted graceful termination
expect(Process).to have_received(:kill).with('KILL', PID) # Force kill if necessary
# Verify that process was reaped after being killed
expect(Process).to have_received(:wait).with(PID)
end
end

context 'when another job already has lock' do
let(:lock_manager) { instance_double(Redlock::Client) }
let(:lock_key) { 'soffice:document_conversion' }

before do
stub_const('LOCK_MANAGER', lock_manager)
allow(described_class).to receive(:execute_with_timeout)
end

it 'it waits to acquire the lock and then runs the command' do
# Set up the locking block to indicate its locked twice, and then unlocked on the third attempt
lock_attempts = 0
allow(lock_manager).to receive(:lock).with(lock_key, anything) do |_key, _timeout, &block|
lock_attempts += 1
if lock_attempts == 3
block.call({ validity: 3000, resource: lock_key, value: 'lock_value' })
else
block.call(false)
end
end

described_class.encode(path, format, outdir, timeout)

# Verify lock was called multiple times
expect(lock_manager).to have_received(:lock).exactly(3).times
# Verify that the process was executed after those retries
expect(Hydra::Derivatives::Processors::Document).to have_received(:execute_with_timeout)
end
end
end
end

0 comments on commit 41282a8

Please sign in to comment.