Skip to content

Commit

Permalink
HYC-1974 Total Dissertation Page Count (#1122)
Browse files Browse the repository at this point in the history
* init total_page_count

* process_dissertation

* impl sum_relation_values

* write to csv
  • Loading branch information
davidcam-src authored Oct 3, 2024
1 parent af2bb8d commit ca9b29e
Showing 1 changed file with 92 additions and 0 deletions.
92 changes: 92 additions & 0 deletions lib/tasks/dissertations_page_count.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# frozen_string_literal: true
# lib/tasks/dissertations_page_count.rake

namespace :dissertations do
desc 'Calculate total page counts for dissertations'

task :page_count, [:year] => :environment do |task, args|
require 'rdf'
require 'rdf/ntriples'
require 'open-uri'
require 'csv'
year = args[:year]

# Method to fetch all dissertations by paginating through Solr results
def fetch_all_dissertations(year = nil)
start = 0
rows = 1000
all_processed_dissertations = []

loop do
query_string = year ? "admin_set_tesim:\"Dissertations\" AND graduation_year_tesim:\"#{year}\"" : 'admin_set_tesim:"Dissertations"'
Rails.logger.info("Querying Solr with: #{query_string}")
response = ActiveFedora::SolrService.get(query_string, start: start, rows: rows)
processed_dissertations = response['response']['docs'].map { |doc| process_dissertation(doc) }
all_processed_dissertations.concat(processed_dissertations)

# Stop when no more results are returned
break if processed_dissertations.empty? || processed_dissertations.length < rows
start += rows
end

all_processed_dissertations
end

def process_dissertation(dissertation)
res = {}
res[:dissertation_id] = dissertation['id']
res[:title] = dissertation['title_tesim']&.first
# Associating page count with the dissertation instead of the fileset
# dissertation['file_set_ids_ssim'] is an array of fileset ids
res[:page_count] = total_page_count_for_fileset_ids(dissertation['file_set_ids_ssim'])
return res
end

def total_page_count_for_fileset_ids(fileset_ids)
sum = 0
fileset_ids.each do |fileset_id|
fileset_object = ActiveFedora::Base.find(fileset_id)

# Check if the object has a page_count method
if fileset_object.respond_to?(:page_count)
page_count = sum_relation_values(fileset_object.page_count)
Rails.logger.info("Page count for #{fileset_id}: #{page_count}")
sum += page_count
else
Rails.logger.warn("No page_count method found for #{fileset_id}")
end
end
return sum
end

def sum_relation_values(relation)
if relation.is_a?(ActiveTriples::Relation) || relation.is_a?(Array)
# Convert each element to an integer and sum them up
relation.map(&:to_i).sum
else
# If it's a single value, just convert it to an integer
relation.to_i
end
end

def write_to_csv(all_processed_dissertations, year)
total_pages_all = 0
path = File.join(Rails.configuration.log_directory, year ? "dissertations_page_count_#{year}.csv" : 'dissertations_page_count_all.csv')
# Write the processed dissertations to a CSV file
CSV.open(path, 'w') do |csv|
# Write CSV headers
csv << ['Dissertation ID', 'Title', 'Page Count']

# Write each dissertation's data to a new row
all_processed_dissertations.each do |dissertation|
csv << [dissertation[:dissertation_id], dissertation[:title], dissertation[:page_count]]
total_pages_all += dissertation[:page_count]
end
csv << ['N/A', 'All Total Pages', total_pages_all]
end
end

all_processed_dissertations = fetch_all_dissertations(year)
write_to_csv(all_processed_dissertations, year)
end
end

0 comments on commit ca9b29e

Please sign in to comment.