Skip to content

Commit

Permalink
OY-4954 Refactor anonymization
Browse files Browse the repository at this point in the history
  • Loading branch information
tomikat committed Jan 8, 2025
1 parent a3111a3 commit e23b438
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 34 deletions.
64 changes: 62 additions & 2 deletions dev-resources/sql/anonymizer-application-queries.sql
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
-- name: sql-get-all-applications
SELECT id FROM applications;
SELECT id FROM applications ORDER BY id;

-- name: sql-get-application
SELECT id, person_oid, content
SELECT id, person_oid, tunnistautuminen, content
FROM applications
WHERE id = :id;

Expand All @@ -13,6 +13,7 @@ SET preferred_name = :preferred_name,
ssn = :ssn,
email = :email,
dob = :dob::DATE,
tunnistautuminen = :tunnistautuminen,
content = :content
WHERE id = :id;

Expand Down Expand Up @@ -59,3 +60,62 @@ WHERE group_answer_values.application_id = :application_id AND

-- name: sql-application-secret-ids
SELECT id FROM application_secrets;

-- name: sql-update-multi-by-key!
-- Sets value to the supplied val on every multi_answer_values row whose key equals
-- the supplied key. There is no application filter, so rows of all applications are affected.
UPDATE multi_answer_values
SET value = :val
WHERE key = :key;

-- name: sql-anonymize-long-textareas-group!
-- Replaces free-text group answer values with Lorem ipsum filler of the same length
-- (capped at the length of the filler text below).
-- A row is rewritten only when its own value is at least 6 characters long, is neither
-- a lowercase UUID nor a d.m.yyyy date, and its answer has field type textArea or textField.
-- The predicates are applied to the updated row itself. Selecting qualifying
-- (application_id, key) pairs first and joining back on that pair alone would also
-- overwrite short, UUID and date values that merely share a key with a qualifying value.
-- left(text, n) yields the same result as substring(text FROM 0 FOR n + 1).
-- field_type is left unqualified as before; presumably it is a group_answers column
-- (confirm against the schema, then qualify it).
UPDATE group_answer_values AS gav
SET value = left(
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris ' ||
        'nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, ' ||
        'sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut ' ||
        'enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat ' ||
        'nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do ' ||
        'eiusmod tempor incididunt ut labore et dolore magna aliqua.',
        length(gav.value)
    )
FROM group_answers AS ga
WHERE ga.application_id = gav.application_id
  AND ga.key = gav.key
  AND field_type IN ('textArea', 'textField')
  AND gav.value !~ '(^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$)|(^[0-9]{1,2}[.][0-9]{1,2}[.][0-9]{4}$)'
  AND length(gav.value) >= 6;

-- name: sql-anonymize-long-textareas-multi!
-- Replaces free-text multi answer values with Lorem ipsum filler of the same length
-- (capped at the length of the filler text below).
-- A row is rewritten only when its own value is at least 6 characters long, is neither
-- a lowercase UUID nor a d.m.yyyy date, its answer has field type textArea or textField,
-- and its key is not one of the guardian contact keys listed in the NOT IN clause.
-- The predicates are applied to the updated row itself. Selecting qualifying
-- (application_id, key) pairs first and joining back on that pair alone would also
-- overwrite short, UUID and date values that merely share a key with a qualifying value.
-- left(text, n) yields the same result as substring(text FROM 0 FOR n + 1).
-- field_type is left unqualified as before; presumably it is a multi_answers column
-- (confirm against the schema, then qualify it).
UPDATE multi_answer_values AS mav
SET value = left(
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris ' ||
        'nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, ' ||
        'sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut ' ||
        'enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat ' ||
        'nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do ' ||
        'eiusmod tempor incididunt ut labore et dolore magna aliqua.',
        length(mav.value)
    )
FROM multi_answers AS ma
WHERE ma.application_id = mav.application_id
  AND ma.key = mav.key
  AND field_type IN ('textArea', 'textField')
  AND ma.key NOT IN ('guardian-phone', 'guardian-name', 'guardian-email', 'guardian-phone-secondary', 'guardian-name-secondary', 'guardian-email-secondary')
  AND mav.value !~ '(^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$)|(^[0-9]{1,2}[.][0-9]{1,2}[.][0-9]{4}$)'
  AND length(mav.value) >= 6;

-- name: sql-anonymize-long-textareas!
-- Replaces free-text values in answers with Lorem ipsum filler of the same length
-- (capped at the length of the filler text below).
-- A row is rewritten only when its field type is textArea or textField, its key is not
-- one of the person detail keys listed in the NOT IN clause, and its value is at least
-- 6 characters long and is neither a lowercase UUID nor a d.m.yyyy date.
-- left(text, n) yields the same result as substring(text FROM 0 FOR n + 1).
UPDATE answers AS ans
SET value = left(
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris ' ||
        'nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, ' ||
        'sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut ' ||
        'enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat ' ||
        'nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do ' ||
        'eiusmod tempor incididunt ut labore et dolore magna aliqua.',
        length(ans.value)
    )
WHERE length(ans.value) >= 6
  AND ans.value !~ '(^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$)|(^[0-9]{1,2}[.][0-9]{1,2}[.][0-9]{4}$)'
  AND ans.key NOT IN ('gender', 'first-name', 'birth-date', 'home-town', 'ssn', 'email', 'preferred-name', 'last-name', 'address', 'phone', 'postal-office', 'postal-code')
  AND ans.field_type IN ('textArea', 'textField');
33 changes: 32 additions & 1 deletion dev/clj/ataru/anonymizer/anonymizer_application_store.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
[ataru.util.random :as crypto]
[cheshire.core :as json]
[clojure.java.jdbc :as jdbc]
[taoensso.timbre :as log]
[yesql.core :as sql])
(:import org.postgresql.util.PGobject))

Expand All @@ -14,6 +15,36 @@
(defn get-application [id]
(first (db/exec :db sql-get-application {:id id})))

(defn anonymize-guardian!
  "Overwrites the guardian name, phone and email multi-answer values with
  fixed test values. Each key/value pair below is passed in order to
  sql-update-multi-by-key!, and all six updates share one transaction.
  Returns the result of the last update call."
  []
  (let [replacements [["guardian-name"            "Testi Huoltaja"]
                      ["guardian-name-secondary"  "Testi Huoltaja"]
                      ["guardian-phone"           "0501234567"]
                      ["guardian-phone-secondary" "0501234567"]
                      ["guardian-email"           "testi1.huoltaja@testiopintopolku.fi"]
                      ["guardian-email-secondary" "testi2.huoltaja@testiopintopolku.fi"]]]
    (jdbc/with-db-transaction [tx {:datasource (db/get-datasource :db)}]
      ;; reduce (rather than doseq) keeps the last call's result as the return value.
      (reduce (fn [_ [answer-key replacement]]
                (sql-update-multi-by-key! {:key answer-key :val replacement}
                                          {:connection tx}))
              nil
              replacements))))

(defn anonymize-long-textareas-group!
  "Runs sql-anonymize-long-textareas-group! inside its own transaction and
  logs a completion message once the transaction form has returned."
  []
  (let [db-spec {:datasource (db/get-datasource :db)}]
    (jdbc/with-db-transaction [tx db-spec]
      (sql-anonymize-long-textareas-group! {} {:connection tx})))
  (log/info "Done anonymizing long textareas in group answers"))

(defn anonymize-long-textareas-multi!
  "Runs sql-anonymize-long-textareas-multi! inside its own transaction and
  logs a completion message once the transaction form has returned."
  []
  (let [db-spec {:datasource (db/get-datasource :db)}]
    (jdbc/with-db-transaction [tx db-spec]
      (sql-anonymize-long-textareas-multi! {} {:connection tx})))
  (log/info "Done anonymizing long textareas in multi answers"))

(defn anonymize-long-textareas!
  "Runs sql-anonymize-long-textareas! inside its own transaction and
  logs a completion message once the transaction form has returned."
  []
  (let [db-spec {:datasource (db/get-datasource :db)}]
    (jdbc/with-db-transaction [tx db-spec]
      (sql-anonymize-long-textareas! {} {:connection tx})))
  (log/info "Done anonymizing long textareas in answers"))

(defn update-application [application]
(let [answers (:answers (:content application))
update-answers-args {:application_id (:id application)
Expand All @@ -26,7 +57,7 @@
(sql-update-application-multi-answer-values! update-answers-args {:connection connection})
(sql-update-application-group-answer-values! update-answers-args {:connection connection}))))

(defn regenerate-application-secrets []
(defn regenerate-application-secrets! []
(jdbc/with-db-transaction [connection {:datasource (db/get-datasource :db)}]
(doseq [id-chunk (->> (sql-application-secret-ids {} {:connection connection})
(map :id)
Expand Down
80 changes: 50 additions & 30 deletions dev/clj/ataru/anonymizer/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
[cheshire.core :as json]
clojure.string
clojure.walk
[taoensso.timbre :as log])
(:import java.util.concurrent.Executors))
[taoensso.timbre :as log]))

(defn- date-to-iso8601
[date]
Expand Down Expand Up @@ -36,12 +35,19 @@
(cond-> (assoc answer :value value)
(= "attachment" (:fieldType answer))
(anonymize-attachment attachment-key))))]
(merge application {:preferred_name (:preferred-name fake-person)
:last_name (:last-name fake-person)
:ssn (:fake-ssn fake-person)
:email (:email fake-person)
:dob (date-to-iso8601 (:birth-date fake-person))
:content (update (:content application) :answers #(map anonymize-answer %))})))
(merge application {:preferred_name (:preferred-name fake-person)
:last_name (:last-name fake-person)
:ssn (:fake-ssn fake-person)
:email (:email fake-person)
:dob (date-to-iso8601 (:birth-date fake-person))
:tunnistautuminen (or (some->> (-> application
:tunnistautuminen
:session
:data
:auth-type)
(update-in {} [:session :data] assoc :auth-type))
{})
:content (update (:content application) :answers #(map anonymize-answer %))})))

(defn fake-person->ataru-person [{:keys [sukupuoli
toinennimi
Expand All @@ -68,28 +74,42 @@
:birth-date syntymaaika})

(defn file->fake-persons [file]
(->> file
(slurp)
(clojure.string/split-lines)
(map (comp fake-person->ataru-person
clojure.walk/keywordize-keys
json/parse-string))
(group-by :person-oid)))
(log/info "Indexing persons")
(time
(->> file
(slurp)
(clojure.string/split-lines)
(map (comp fake-person->ataru-person
clojure.walk/keywordize-keys
json/parse-string))
(group-by :person-oid))))

(defn anonymize-data [& args]
(assert (not (clojure.string/blank? (second args))))
(let [executor-service (Executors/newFixedThreadPool
(.availableProcessors (Runtime/getRuntime)))
fake-persons (file->fake-persons (first args))
attachment-key (second args)]
(doseq [id (application-store/get-all-application-ids)]
(.execute
executor-service
(fn []
(let [application (application-store/get-application id)]
(if-let [fake-person (first (get fake-persons (:person_oid application)))]
(do (application-store/update-application (anonymize fake-person attachment-key application))
(log/info "Anonymized application" (:id application)))
(log/info "Did not anonymize application" (:id application)))))))
(.shutdown executor-service)
(application-store/regenerate-application-secrets)))
(let [fake-persons (file->fake-persons (first args))
attachment-key (second args)
application-ids (application-store/get-all-application-ids)
last-id (last application-ids)]
(log/info "Anonymise" (count application-ids) "application ids")
(time
(dorun
(pmap (fn [id]
(let [application (application-store/get-application id)]
(if-let [fake-person (first (get fake-persons (:person_oid application)))]
(do (application-store/update-application (anonymize fake-person attachment-key application))
(when (or (= last-id id)
(= 0 (mod id 1000)))
(log/info "Anonymized application id" (:id application))))
(log/info "Did not anonymize application" (:id application)))))
application-ids)))
(log/info "Anonymize guardians")
(time (application-store/anonymize-guardian!))
(log/info "Anonymize long textareas")
(time
(dorun (pcalls application-store/anonymize-long-textareas-group!
application-store/anonymize-long-textareas-multi!
application-store/anonymize-long-textareas!)))
(log/info "Regenerate application secrets")
(time (application-store/regenerate-application-secrets!)))
(log/info "Shutting down")
(shutdown-agents))
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
:test-paths ["spec"]
:resource-paths ["src/sql" "resources"]
:uberjar-name "ataru.jar"
:jvm-opts ^:replace ["-Xmx2g"]
:jvm-opts ^:replace ["-Xmx8g"]

:plugins [[lein-cljsbuild "1.1.8"]
[lein-doo "0.1.11"]
Expand Down

0 comments on commit e23b438

Please sign in to comment.