From f7a312662ae9cf9f902e3730800ac6e9cf1a641f Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Wed, 20 Sep 2023 04:38:11 -0500
Subject: [PATCH] PII removal with multi-process support based on minmin's
 code (#62)
* commit pii removal mp
Signed-off-by: Xue, Chendi
* Update README.md
* rename folder
Signed-off-by: Xue, Chendi
* fix
Signed-off-by: Xue, Chendi
* Update README.md
---------
Signed-off-by: Xue, Chendi
---
tools/pii_removal_for_contact_mp/README.md | 27 ++
.../pii_detection_redaction/README.md | 34 ++
.../scripts/pii-pile-hn.sh | 14 +
.../scripts/pii-redpj.sh | 17 +
.../scripts/pii-refinedweb.sh | 14 +
.../scripts/pii-slimpj.sh | 16 +
.../src/bigscience_pii_detect_redact.py | 325 ++++++++++++++++++
.../pii_detection_redaction/src/detect_pii.py | 80 +++++
.../src/pii_redaction.py | 320 +++++++++++++++++
.../src/pii_redaction_v2.py | 294 ++++++++++++++++
.../src/process_exceptions.py | 10 +
.../pii_detection_redaction/src/redact_pii.py | 186 ++++++++++
.../pii_detection_redaction/src/utils.py | 162 +++++++++
.../src/validate_ray_outputs.py | 158 +++++++++
.../pii_redaction.py | 23 ++
.../pii_redaction_impl.py | 142 ++++++++
16 files changed, 1822 insertions(+)
create mode 100644 tools/pii_removal_for_contact_mp/README.md
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_redaction.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_redaction_impl.py
diff --git a/tools/pii_removal_for_contact_mp/README.md b/tools/pii_removal_for_contact_mp/README.md
new file mode 100644
index 000000000..1b8d8fd95
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/README.md
@@ -0,0 +1,27 @@
+# PII removal for contact info
+
+## Intro
+
+PII removal for contact info replaces personal information such as email addresses and phone numbers with random nonsense strings to protect personal information.
+This script uses multiprocessing to speed up PII removal.
+
+## Expected input and Output
+
+Input format: a folder of *parquet files; a 'text' column is required in the parquet schema.
+
+Output format: a folder of *parquet files; the 'text' column is processed and personal info is replaced.
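+
+A quick way to sanity-check one input file (a sketch, assuming pandas with a parquet engine is installed; the path is a placeholder):
+```python
+import pandas as pd
+
+# The tool expects every input parquet file to expose a 'text' column.
+df = pd.read_parquet("path/to/one_file.parquet")  # hypothetical path
+assert "text" in df.columns
+```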
+
+## How to RUN
+```
+conda create --name pyrecdp
+conda activate pyrecdp
+pip install pyrecdp --pre
+pip install presidio_analyzer
+python -m spacy download en_core_web_lg
+python pii_redaction.py -d ../falcon-refinedweb -o ../falcon-refinedweb-pii_removal -mp 224
+```
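+
+Here `-d` is the input folder, `-o` is the output folder, and `-mp` is the number of worker processes (default -1).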
+
+## NOTICE
+
+We run file-wise parallelism; a 300MB file usually takes around 15-20 min to complete, so the progress bar will advance slowly.
+One way to confirm the job is active is to run 'top' and check whether multiple python processes are busy. A scripted version of this check is sketched below.
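+
+A minimal scripted check (a sketch; assumes a POSIX system with pgrep, and that the worker command lines contain 'pii_redaction'):
+```python
+# Count the python processes whose command line mentions the redaction script.
+import subprocess
+out = subprocess.run(["pgrep", "-c", "-f", "pii_redaction"], capture_output=True, text=True)
+print(out.stdout.strip() or "0")
+```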
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md b/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
new file mode 100644
index 000000000..dae0a7658
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
@@ -0,0 +1,34 @@
+# How to run PII-for-text pipeline
+
+## Overview
+The pipeline detects 5 different types of PIIs: 'PHONE_NUMBER', 'IP_ADDRESS', 'EMAIL', 'USER', 'KEY'. The detection is based on regular expressions using open-source packages including presidio and bigscience-pii. The detection precision/recall has been tuned for web-scrape-based datasets, for example, Falcon-RefinedWeb, SlimPajama-StackExchange, PILE-Hackernews. But the detection precision/recall is not good for code data like GitHub.
+
+Two redaction methods have been implemented:
+1. Replacement with random values
+2. Replacement with tags such as [PHONE_NUMBER], [EMAIL], etc.
+Currently, option 1 is used. A short usage sketch of both helpers is shown below.
+
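+For illustration, a minimal sketch of calling the two redaction helpers in `src/redact_pii.py` directly (run from the src folder; the sample text and span are made up):
+```python
+# Both helpers take the original text plus a list of
+# {'start', 'end', 'value', 'type'} dicts as produced by the detectors.
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+
+text = "Contact me at jane@example.com"
+piis = [{'start': 14, 'end': 30, 'value': 'jane@example.com', 'type': 'EMAIL'}]
+
+print(redact_pii_with_tags(text, piis))           # Contact me at [EMAIL]
+print(redact_pii_with_random_values(text, piis))  # e.g. Contact me at kqzrwbetha@mxpqo.com
+```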
+
+## How to run
+### Step 1: Set up Env
+Please follow [this guide](../workload_in_containers/README.md) on how to set up the container environment for this workload. Once the containers are running, you can enter the container on the head node using the following command:
+```bash
+docker exec -it ray-leader bash
+```
+
+### Step 2: Run PII removal
+Once you are inside the ray-leader container, go to the scripts folder. You can change `BATCHSIZE` and `CPUCORES` depending on the memory and number of cores on your system. Then run the PII script, for example:
+```
+bash pii-refinedweb.sh
+```
+
+### Step 3: Validate outputs
+We implemented 3 checks:
+1. Check the schema and sample rows in the output parquets by loading them with pandas
+2. Count the number of PIIs per category by sampling from the outputs. You can further estimate the total number of PIIs per category by multiplying the sampled counts by total_num_samples/samples_used_for_this_check (e.g., 40 emails in a 1,000-row sample of a 1,000,000-row dataset suggests roughly 40,000 emails in total)
+3. Visually check a small sample by producing an HTML file with yellow highlights on the PIIs, annotated with the corresponding category (note that sometimes the highlights are not at the exact location, but should be quite close)
+
+```
+# First change the path to the data files in the python script
+python src/validate_ray_outputs.py
+```
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
new file mode 100644
index 000000000..cc3d168b5
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+BATCHSIZE=1000
+CPUCORES=48
+DATA=pile_hn
+OUTPUT_PREFIX=pile_hn
+DATA_DIR=/home/user/local/PILE/hn
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
new file mode 100644
index 000000000..aea0eda67
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+BATCHSIZE=50000
+CPUCORES=48
+INPUT=togethercomputer/RedPajama-Data-1T-Sample
+DATA=slimpajama
+OUTPUT_PREFIX=redpajama
+DATA_DIR=/home/user/local/dataset/RedPajama-Data-1T-Sample/
+
+python ../src/pii_redaction.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--input $INPUT \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
+#--skip 500000
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
new file mode 100644
index 000000000..2b3a4b858
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+BATCHSIZE=1000
+CPUCORES=48
+DATA=refinedweb
+OUTPUT_PREFIX=pii_test_output
+DATA_DIR=/home/user/local/refinedweb_samples
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
new file mode 100644
index 000000000..3601b582f
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+BATCHSIZE=50000
+CPUCORES=48
+INPUT=cerebras/SlimPajama-627B
+DATA=slimpajama
+OUTPUT_PREFIX=pii_slimpajama_se
+DATA_DIR=/home/user/local/
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--input $INPUT \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
new file mode 100644
index 000000000..904477dcc
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
@@ -0,0 +1,325 @@
+# -*- coding: utf-8 -*-
+"""MST BigScience PII Code
+
+Original colab that is a source of this file is located at
+ https://colab.research.google.com/drive/1086H3-LGMz3gX0pGy9ECgr8KflosSKso
+
+# License
+
+Copyright 2022 Authors of this Notebook
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+# What is this colab?
+
+This colab detects the following kinds of PII for all languages in BigScience.
+Languages assumed are ["ar", "as", "bn", "ca", "en", "es", "eu", "fr", "gu", "hi", "id", "ig", "mr", "ny", "pa", "pt", "sn", "st", "sw", "ur", "vi", "xh", "yo", "zh", "zu"]
+
+## Highest Risk
+### Simple spans of characters:
+* **IDs [general]:** This is anything that is a sequence of 6 or more digits, as is common in identifiers for people internationally (national IDs, tax IDs, passport numbers, etc.), credit card numbers, IBAN codes, etc.
+* **Key [general]**: This is anything that is a sequence of digits and letters in the same string, optionally with spaces. Common for Credit Card and API, SSH, GPG keys. (Privacy group doesn't have a regex for this)
+* **Email address**, **User name**: Strings using @
+* **IP address**: Digits with periods in them
+* **Phone number**: At least 7 digits with spaces in them
+* **License plate**: (Privacy group doesn't have cross-lingual handling for this, MST group doesn't have a regex for this)
+
+### More complex spans: (WORK IN PROGRESS)
+* **Full Names**: Requires additional NER package
+* **Address**
+
+
+## Lower Risk: (We're not doing)
+* **URL**
+* **Time**: dateparser dependency
+* **Date**: dateparser dependency
+* **Age**
+
+"""
+
+
+#@title Define highest risk PII. TODO: License plate
+# NUMBER removed last minute due to false positives. See https://huggingface.slack.com/archives/C0307KE5UNT/p1647011702716159
+high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS', 'IPv4', 'IPv6'} # , 'NUMBER', "ID"}
+
+"""# Regexes"""
+
+#@title Get the less sophisticated MST regexes for High Risk scenarios (baseline comparison). Not language-specific; all are general.
+import sys
+import regex
+import ipaddress
+# These are ordered so that we can return upon a match; no need to search for a substring.
+year_patterns = [
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-yyyy or yyyy/yyyy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # mm-yyyy or mm/yyyy or the same but with yy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-mm or yyyy/mm
+]
+
+# Patterns for high-risk character strings
+id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
+# https://regex101.com/r/JQkmh8/2
+# key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
+# https://regex101.com/r/JQkmh8/5
+key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
+ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
+ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
+
+# presidio
+# #ipv4_pattern = r"\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
+# this one below gives a lot of false positives ::
+#ipv6_pattern = r"\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b"
+ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
+
+
+# bigcode-pii
+ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
+ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
+ip_pattern = (
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])("
+ + r"|".join([ipv4_pattern, ipv6_pattern])
+ + ")(?:$|[\s@,?!;:'\"(.\p{Han}])"
+)
+
+# https://regex101.com/r/EpA5B7/1
+email_pattern = r'''
+ (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
+ (
+ [^\b\s@?!;,:)('"<]+
+ @
+ [^\b\s@!?;,/]*
+ [^\b\s@?!;,/:)('">.]
+ \.
+ \p{L} \w{1,}
+ )
+ (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
+'''
+
+# https://regex101.com/r/mOqi1s/3
+#user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
+user_pattern = r'''
+ (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
+ (@
+ [^)(\s@,?!;:'"]{3,}
+ )
+'''
+# Examples from https://regexpattern.com/phone-number/
+# https://regex101.com/r/lZZ0XP/4
+# Also matches MLS numbers
+# phone_pattern = r'(?:^|[\s\'\"(\p{Han}])((?:\+\p{Nd}+[ \/.\p{Pd}]*)?(?:(?:\(\+?\p{Nd}+\))?(?:[ \/.\p{Pd}]*\p{Nd})){7,}(?:[\t\f #]*\p{Nd}+)?)(?:$|[\s@,?!;:\'\"(.\p{Han}])'
+
+id_regex = regex.compile(id_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+key_regex = regex.compile(key_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+ipv4_regex = regex.compile(ipv4_pattern)
+ipv6_regex = regex.compile(ipv6_pattern)
+ip_regex = regex.compile(ip_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+email_regex = regex.compile(email_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE)
+user_regex = regex.compile(user_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE)
+# phone_regex = regex.compile(phone_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+# TODO: license
+
+
+#sasha_regexes = copy.deepcopy(regex_rulebase)
+mst_regexes = {}
+for tag in high_risk_tags:
+ #print(tag)
+ if tag == 'ID':
+ mst_regexes['ID'] = id_regex
+ elif tag == 'KEY':
+ mst_regexes['KEY'] = key_regex
+ elif tag == 'IPv4':
+ mst_regexes['IPv4'] = ipv4_regex
+ elif tag == 'IPv6':
+ mst_regexes['IPv6'] = ipv6_regex
+ elif tag == 'IP_ADDRESS':
+ mst_regexes['IP_ADDRESS'] = ip_regex
+ elif tag == 'EMAIL':
+ mst_regexes['EMAIL'] = email_regex
+ elif tag == 'USER':
+ mst_regexes['USER'] = user_regex
+# elif tag == 'NUMBER':
+# mst_regexes['NUMBER'] = phone_regex
+ else:
+ sys.stderr.write('Dont have tag regex pattern for %s =(' % tag)
+
+#print("MST regexes under examination are:")
+#for tag, regx in mst_regexes.items():
+ #print(tag, end=":\t")
+ #print(regx)
+
+"""# PI Detection and Redaction functions are defined here! """
+
+#@title The detection functions and basic filtering functions are defined here.
+# tag_type = {'ID', 'KEY', 'EMAIL', 'IP_ADDRESS', 'PHONE', 'LICENSE_PLATE'}
+# Choose whether to put this import before or after, depending on which you're testing. =)
+
+def ip_has_digit(matched_str):
+ """Checks to make sure the PII span is not just :: or whatever that may
+ accidentally be picked up by making sure there are digits."""
+ return any(map(str.isdigit, matched_str))
+
+# from bigcode-pii
+def filter_versions(matched_str, context):
+ """Filter addresses in this format x.x.x.x, x.xx.x.x and the words dns/server
+ don't appear in the neighboring context, usually they are just versions"""
+ # count occurrence of dots
+ dot_count = matched_str.count('.')
+ exclude = (dot_count <= 3 and len(matched_str) <= 8)
+ if exclude:
+ if "dns" in context.lower() or "server" in context.lower():
+ return False
+ return exclude
+
+# from bigcode-pii
+def not_ip_address(matched_str):
+ """ make sure the string has a valid IP address format
+ e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1
+ TODO: fix this directly in the regex"""
+ try:
+ ipaddress.ip_address(matched_str)
+ return False
+ except:
+ return True
+
+
+def matches_date_pattern(matched_str):
+ # Screen out date false positives
+ for year_regex in year_patterns:
+ if year_regex.match(matched_str):
+ return True
+ return False
+
+def is_website(matched_str):
+ # TODO
+ return False
+
+def detect_pii(text, lang, tag_types):
+ matches = []
+ for tag in tag_types:
+ label_pattern = mst_regexes[tag]
+ # !! regex.match happens here!!
+ matches_tmp = label_pattern.finditer(text)
+ for match in matches_tmp:
+ # TODO: Why does this happen?
+ if match.groups():
+ if len(match.groups()) > 1 and match.groups()[1]:
+ sys.stderr.write("Warning: Found substring matches in the main match.")
+ #print(tag)
+ #print(text)
+ #print(match.groups())
+ matched_str = match.groups()
+ #print(matched_str)
+ # Why does this happen?
+ matched_str = matched_str[0]
+ start, end = match.span(1)
+
+ if matched_str:
+ if tag in ["IP_ADDRESS"]:
+ # Filter out false positive IPs
+ if not ip_has_digit(matched_str):
+ continue
+ # this is to filter out versions, copied from bigcode-pii
+ if filter_versions(matched_str, text[start-100:end+100]):
+ #print('Detected: version: ', matched_str)
+ continue
+ # this is to filer out invalid ip address, copied from bigcode-pii
+ if not_ip_address(matched_str):
+ #print('Detected: invalid id address: ', matched_str)
+ continue
+ if tag in ["ID", "IP_ADDRESS"]: #, "NUMBER"]:
+ # Filter out date false positives
+ if matches_date_pattern(matched_str):
+ continue
+ # TODO: Implement
+ # if tag in ["KEY"]:
+ # # TODO: implement
+ # if is_website(matched_str):
+ # continue
+ matches += [(matched_str, match.span(), str(label_pattern), tag, lang)]
+ return matches
+
+
+#@title Redaction function defined here.
+def redact_pii(text, matches):
+ """Takes a match as defined in the detect_pii function and redacts it from the full string, returning a tuple."""
+ redacted_str = text
+ metadata = []
+ for match in matches:
+ matched_str = match[0]
+ tag = match[3]
+ redact_tag = "PI:" + tag
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+ # Create the "metadata" as all of the information we had before redaction
+ metadata += [(match)]
+ return (redacted_str, metadata)
+
+#@title General function to run the PII detection and redact it, saving everything else to metadata, is defined here.
+def run_pii(text, lang):
+ """
+ Runs the given set of regexes on the data "lines" and pulls out the
+ tagged items.
+ The lines structure stores the language type(s). This can be used for
+ language-specific regexes, although we're dropping that for now and using
+ only "default"/non-language-specific regexes.
+ """
+
+ #print('Detecting....')
+ # What is this for...?
+ text = text.encode().decode()
+ matches = detect_pii(text, lang, high_risk_tags)
+ #print(matches)
+ match_set = (text, {})
+ if len(matches) > 0:
+ # !!! REDACTION HAPPENS HERE !!!
+ redacted_str, metadata = redact_pii(text, matches)
+ metadata_out = {"regex metadata":metadata, "original": text, "redacted": redacted_str}
+ match_set = (redacted_str, metadata_out)
+ return match_set
+
+
+def run_pii_batch(exs, lang):
+ """
+ Runs the given set of regexes on the data "lines" and pulls out the
+ tagged items.
+ The lines structure stores the language type(s). This can be used for
+ language-specific regexes, although we're dropping that for now and using
+ only "default"/non-language-specific regexes.
+ """
+ regex_metadata = []
+ old_text = []
+ new_text = []
+ modified = []
+ for text in exs["text"]:
+ # What is this for...?
+ text = text.encode().decode()
+ matches = detect_pii(text, lang, high_risk_tags)
+ if len(matches) > 0:
+ # !!! REDACTION HAPPENS HERE !!!
+ redacted_str, metadata = redact_pii(text, matches)
+ regex_metadata.append(repr(metadata))
+ old_text.append(text)
+ new_text.append(redacted_str)
+ modified.append(True)
+ else:
+ regex_metadata.append("")
+ old_text.append(text)
+ new_text.append(text)
+ modified.append(False)
+ result = {
+ "regex_metadata": regex_metadata,
+ "old_text": old_text,
+ "text": new_text,
+ "modified": modified
+ }
+ return result
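+
+# Example (illustrative): run_pii returns the (possibly redacted) text and a
+# metadata dict; with the tag-style redaction used in this module, an email
+# becomes PI:EMAIL:
+#   redacted, meta = run_pii("Contact me at jane@example.com", "en")
+#   # redacted == 'Contact me at PI:EMAIL'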
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
new file mode 100644
index 000000000..b9df49d64
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
@@ -0,0 +1,80 @@
+
+#from presidio_analyzer import AnalyzerEngine
+from utils import parse_recognizer_result, high_risk_tags
+from bigscience_pii_detect_redact import matches_date_pattern, detect_pii
+
+
+
+def detect_phone_numbers(text, analyzer):
+ # use presidio phone recognizer to detect phone numbers
+ # threshold is set to 0.4 based on a sample study
+ results = analyzer.analyze(text=text,
+ entities=['PHONE_NUMBER'],
+ #language='en',
+ #score_threshold=0.4,
+ #return_decision_process=True
+ )
+
+ pii_list = []
+
+ if len(results)>0:
+ for result in results:
+ # parse the output into dictionary
+ pii_dict = parse_recognizer_result(result)
+
+ # check if the number string is a date
+ number_str = text[pii_dict['start']: pii_dict['end']]
+
+ if matches_date_pattern(number_str):
+ #print('Date, not phone number')
+ pass
+
+ else:
+ pii_dict['value']=number_str
+ pii_list.append(pii_dict)
+ #print(pii_dict)
+
+ return pii_list
+
+
+
+def detect_other_piis(text):
+ matches = detect_pii(text, None, high_risk_tags)
+ if len(matches)>0:
+ pii_list = []
+ for m in matches:
+ pii = {}
+ pii['type']=m[-2]
+ pii['start']=m[1][0]
+ pii['end']=m[1][1]
+ pii['value']=m[0]
+ #print(pii)
+ pii_list.append(pii)
+
+ return pii_list
+ else:
+ return None
+
+def merge_outputs(presidio_outputs, bigscience_outputs):
+ if bigscience_outputs!=None:
+ piis = presidio_outputs + bigscience_outputs
+ # TODO: sometimes KEY and PHONE_NUMBER overlap
+ # when merging, only keep one of them
+ # right now, the short-cut is to have the KEY and PHONE_NUMBER replacement to be the same format
+
+ # detected_spans = []
+ # piis_to_remove = []
+ # for pii in piis:
+ # span = (pii['start'], pii['end'])
+ # if span in detected_spans:
+ # #remove pii from piis
+ # print('remove this pii: ', pii)
+ # piis_to_remove.append(pii)
+
+ # detected_spans.append(span)
+
+ # piis = [pii for pii in piis if pii not in piis_to_remove]
+
+ else:
+ piis = presidio_outputs
+ return piis
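+
+# Example (illustrative): detect_other_piis returns a list of span dicts, e.g.
+#   detect_other_piis("ping me at jane@example.com")
+#   -> [{'type': 'EMAIL', 'start': 11, 'end': 27, 'value': 'jane@example.com'}]
+# merge_outputs then simply concatenates these with the presidio phone spans.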
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
new file mode 100644
index 000000000..d5054a37e
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
@@ -0,0 +1,320 @@
+"""
+this script is for processing pii detection and redaction
+"""
+
+# from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+from utils import summarize_pii_entities
+import time
+import json
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+
+import os, sys
+import time
+import argparse
+from pprint import pprint
+from typing import Dict, List
+
+import ray
+import ray.data
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+
+import logging
+import glob
+
+
+def detect_redact_pii_for_one_text(text, analyzer):
+
+ detected_phone_numbers = detect_phone_numbers(text, analyzer)
+
+ # get output from bigscience-pii
+ detected_other_piis = detect_other_piis(text)
+ # merge the two outputs
+ piis = merge_outputs(detected_phone_numbers, detected_other_piis)
+ #print('Merged PIIs: ', piis)
+
+ if len(piis)>0:
+ # save result
+ #redact
+ redacted_text = redact_pii_with_random_values(text, piis)
+ #redacted_text = redact_pii_with_tags(text, piis)
+
+ output = {
+ 'redacted': redacted_text,
+ 'pii': piis,
+ "modified": True
+ }
+
+
+ else:
+ output = {
+ 'redacted': None,
+ 'pii': None,
+ "modified": False
+
+ }
+
+ return output
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ group = parser.add_argument_group(title="input data")
+ group.add_argument(
+ "--input",
+ type=str,
+ default="tiiuae/falcon-refinedweb",
+ required=False,
+ help="Name of the dataset repository,e.g. togethercomputer/RedPajama-Data-1T"
+ )
+
+ # group.add_argument(
+ # "--format",
+ # type=str,
+ # default="parquet",
+ # required=False,
+ # help="input data format, parquet or json"
+ # )
+
+ group.add_argument(
+ "--dataset-family",
+ type=str,
+ default="refinedweb",
+ required=False,
+ help="choose from: refinedweb, slimpajama, pile"
+ )
+
+ group.add_argument(
+ "--data-dir",
+ type=str,
+ required=False,
+ help="for local mode, you need to provide local dataset repository, e.g. /home/user/local"
+ )
+ group.add_argument(
+ "--cache-dir",
+ type=str,
+ default='/root/.cache',
+ help="Hugging Face cache dir, where the hugging face dataset it stored"
+ )
+ group.add_argument(
+ '--local',
+ default=False,
+ action='store_true',
+ help="whether to use local mode to preprocess data"
+ )
+ group.add_argument(
+ "--load-batch-size", type=int, default=1000, help="only needed if you use streaming mode to read data from hugging face"
+ )
+ group.add_argument(
+ "--skip", type=int, default=None, help="how many samples to skip"
+ )
+ group = parser.add_argument_group(title="output data")
+ group.add_argument(
+ "--output-prefix",
+ type=str,
+ required=False,
+ default="processed",
+ help="Path to binary output file without suffix",
+ )
+ group = parser.add_argument_group(title="runtime")
+ group.add_argument(
+ "--cpu-per-worker", type=int, default=1, help="Number of CPUs to use per worker"
+ )
+
+ args = parser.parse_args()
+ args.output_path = '/home/user/local'
+ return args
+
+
+def main():
+ args = get_args()
+
+ # if args.format not in ['parquet', 'json']:
+ # raise ValueError('data file format must be parquet or json')
+
+ output_dir = os.path.join(args.output_path, args.output_prefix)
+ if not os.path.exists(output_dir):
+ os.mkdir(output_dir)
+ exception_dir = output_dir+'/exceptions/'
+ cache_dir = args.cache_dir
+ dataset_family = args.dataset_family
+ log_dir = output_dir+'/logs/'
+ if not os.path.exists(log_dir):
+ os.mkdir(log_dir)
+
+ logging.basicConfig(filename=log_dir+"newlog.txt",
+ format='%(asctime)s %(message)s',
+ filemode='w')
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ logger.info(args)
+ logger.info('processing {} data.....'.format(dataset_family))
+
+ # init ray
+ ray.init(address='auto')
+ pprint(ray.cluster_resources())
+ num_nodes = len(ray.nodes())
+ parallelism = num_nodes * args.cpu_per_worker
+
+ logger.info('num of ray nodes: {}'.format(num_nodes))
+ logger.info('parallelism: {}'.format(parallelism))
+
+ def preprocess_fn(contents, metas, analyzer) -> pd.DataFrame:
+ # inputs are in batches
+ redacted_content = []
+ modified = []
+ piis = []
+ meta_output = []
+ original_content = []
+
+ exceptions = []
+
+ for i, text in enumerate(contents):
+ try:
+ # # for testing exception
+ # if i%5 == 0:
+ # raise ValueError
+ output = detect_redact_pii_for_one_text(text, analyzer)
+ modified.append(output['modified'])
+ piis.append(output['pii'])
+ if output['pii'] != None:
+ redacted_content.append(output['redacted'])
+ else:
+ redacted_content.append(text)
+ meta_output.append(metas[i])
+ except:
+ logger.debug('exception occurred!') # seems cannot log from ray actor using this method
+ exceptions.append({
+ 'text':text,
+ 'meta': metas[i]
+ })
+ if len(exceptions)>0:
+ if not os.path.exists(exception_dir):
+ os.mkdir(exception_dir)
+ task_id = ray.get_runtime_context().get_task_id()
+ with open(exception_dir + task_id+'.json', 'w') as f:
+ json.dump(exceptions, f)
+
+ return pd.DataFrame({#"original": original_content,
+ 'redacted': redacted_content,
+ 'piis': piis,
+ 'meta': meta_output,
+ 'modified': modified})
+
+
+
+ def pii_removal_refinedweb(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+
+ contents = batch['content'].tolist()
+
+ try:
+ urls = batch['url'].tolist()
+ timestamps = batch['timestamp'].tolist()
+ dump = batch['dump'].tolist()
+ segment = batch['segment'].tolist()
+ metas = []
+
+ for i in range(len(urls)):
+ metas.append({
+ 'url': urls[i],
+ 'timestamp': timestamps[i],
+ 'dump': dump[i],
+ 'segment': segment[i]
+ })
+ except:
+ metas = [None]*len(contents)
+
+ return preprocess_fn(contents, metas, analyzer)
+
+ def pii_removal_slimpajama_pile(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+ # try:
+ contents = batch['text'].tolist()
+ try:
+ metas = batch['meta'].tolist()
+ # print(metas)
+ except:
+ metas = [None]*len(contents)
+ return preprocess_fn(contents, metas, analyzer)
+ # except:
+ # if not os.path.exists(exception_dir):
+ # os.mkdir(exception_dir)
+ # task_id = ray.get_runtime_context().get_task_id()
+ # with open(exception_dir + task_id+'.json', 'w') as f:
+ # json.dump(batch, f)
+
+
+
+ if not args.local:
+ dataset = load_dataset(args.input, streaming=True)['train']
+ else:
+ data_dir = args.data_dir
+ if dataset_family == 'refinedweb':
+ datafiles = glob.glob(data_dir + '*.parquet')
+ dataset = load_dataset('parquet', data_files = datafiles, streaming=True)['train']
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ datafiles = glob.glob(data_dir+'*.jsonl')
+ dataset = load_dataset('json', data_files = datafiles, streaming=True)['train']
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ if args.skip != None:
+ dataset_to_process = dataset.skip(args.skip)
+ else:
+ dataset_to_process = dataset
+
+ idx = 1
+
+ t0 = time.time()
+ for rows in dataset_to_process.iter(batch_size=args.load_batch_size):
+ logger.info('Start processing batch # {}'.format(idx))
+ print("-----------------------------")
+ df = pd.DataFrame(rows)
+ # logger.info(df['meta'])
+ ray_dataset = ray.data.from_pandas(df)
+ ray_dataset = ray_dataset.repartition(parallelism) #, shuffle = True)
+
+ if dataset_family == 'refinedweb':
+ print('process refinedweb')
+ process_fn = pii_removal_refinedweb
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ print('process slimpj or pile')
+ process_fn = pii_removal_slimpajama_pile
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ tokenized_data = ray_dataset.map_batches(process_fn, batch_format="numpy", batch_size=None)
+
+ if dataset_family == 'refinedweb':
+ tokenized_data.write_parquet(output_dir)
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ tokenized_data.write_json(output_dir)
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ logger.info('Finished processing batch # {}'.format(idx))
+ logger.info(f"{idx} * {args.load_batch_size} samples were written to disk.")
+ idx += 1
+ print("============================")
+ if idx == 2:
+ #sys.exit()
+ break
+ t1 = time.time()
+ logger.info('Processing {} samples took {:.3f} sec'.format((idx-1)*args.load_batch_size, t1-t0))
+
+
+if __name__ == "__main__":
+ start = time.time()
+ main()
+ end = time.time()
+ print(f"\nthis script took {end-start}s.")
+
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
new file mode 100644
index 000000000..ff78bb9c9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
@@ -0,0 +1,294 @@
+"""
+this script is for processing pii detection and redaction
+"""
+
+# from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+from utils import summarize_pii_entities
+import time
+import json
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+
+import os, sys
+import time
+import argparse
+from pprint import pprint
+from typing import Dict, List
+
+try:
+ import ray
+ import ray.data
+except:
+ pass
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+
+import logging
+import glob
+
+
+def detect_redact_pii_for_one_text(text, analyzer):
+
+ detected_phone_numbers = detect_phone_numbers(text, analyzer)
+
+ # get output from bigscience-pii
+ detected_other_piis = detect_other_piis(text)
+ # merge the two outputs
+ piis = merge_outputs(detected_phone_numbers, detected_other_piis)
+ #print('Merged PIIs: ', piis)
+
+ if len(piis)>0:
+ # save result
+ #redact
+ redacted_text = redact_pii_with_random_values(text, piis)
+ #redacted_text = redact_pii_with_tags(text, piis)
+
+ output = {
+ 'redacted': redacted_text,
+ 'pii': piis,
+ "modified": True
+ }
+
+
+ else:
+ output = {
+ 'redacted': None,
+ 'pii': None,
+ "modified": False
+
+ }
+
+ return output
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ group = parser.add_argument_group(title="input data")
+ group.add_argument(
+ "--input",
+ type=str,
+ default="tiiuae/falcon-refinedweb",
+ required=False,
+ help="Name of the dataset repository,e.g. togethercomputer/RedPajama-Data-1T"
+ )
+
+ # group.add_argument(
+ # "--format",
+ # type=str,
+ # default="parquet",
+ # required=False,
+ # help="input data format, parquet or json"
+ # )
+
+ group.add_argument(
+ "--dataset-family",
+ type=str,
+ default="refinedweb",
+ required=False,
+ help="choose from: refinedweb, slimpajama, pile"
+ )
+
+ group.add_argument(
+ "--data-dir",
+ type=str,
+ required=False,
+ help="for local mode, you need to provide local dataset repository, e.g. /home/user/local"
+ )
+ group.add_argument(
+ "--cache-dir",
+ type=str,
+ default='/root/.cache',
+ help="Hugging Face cache dir, where the hugging face dataset it stored"
+ )
+ group.add_argument(
+ '--local',
+ default=False,
+ action='store_true',
+ help="whether to use local mode to preprocess data"
+ )
+ group.add_argument(
+ "--load-batch-size", type=int, default=1000, help="only needed if you use streaming mode to read data from hugging face"
+ )
+ group.add_argument(
+ "--skip", type=int, default=None, help="how many samples to skip"
+ )
+ group = parser.add_argument_group(title="output data")
+ group.add_argument(
+ "--output-prefix",
+ type=str,
+ required=False,
+ default="processed",
+ help="Path to binary output file without suffix",
+ )
+ group = parser.add_argument_group(title="runtime")
+ group.add_argument(
+ "--cpu-per-worker", type=int, default=1, help="Number of CPUs to use per worker"
+ )
+
+ args = parser.parse_args()
+ args.output_path = '/home/user/local'
+ return args
+
+
+def main():
+ args = get_args()
+
+ # if args.format not in ['parquet', 'json']:
+ # raise ValueError('data file format must be parquet or json')
+
+ output_dir = os.path.join(args.output_path, args.output_prefix)
+ if not os.path.exists(output_dir):
+ os.mkdir(output_dir)
+ exception_dir = output_dir+'/exceptions/'
+ cache_dir = args.cache_dir
+ dataset_family = args.dataset_family
+ log_dir = output_dir+'/logs/'
+ if not os.path.exists(log_dir):
+ os.mkdir(log_dir)
+
+ logging.basicConfig(filename=log_dir+"newlog.txt",
+ format='%(asctime)s %(message)s',
+ filemode='w')
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ logger.info(args)
+ logger.info('processing {} data.....'.format(dataset_family))
+
+ # init ray
+ ray.init(address='auto')
+ pprint(ray.cluster_resources())
+ num_nodes = len(ray.nodes())
+ parallelism = num_nodes * args.cpu_per_worker
+
+ logger.info('num of ray nodes: {}'.format(num_nodes))
+ logger.info('parallelism: {}'.format(parallelism))
+
+ def preprocess_fn(contents, analyzer) -> pd.DataFrame:
+ # inputs are in batches
+ text, doc_id, hash, meta, source, bytes = contents
+ redacted_content = []
+ modified = []
+ piis = []
+ meta_output = []
+ doc_id_output = []
+ hash_output = []
+ source_output = []
+ bytes_output = []
+
+ exceptions = []
+
+ for i, txt in enumerate(text):
+ try:
+ # # for testing exception
+ # if i%5 == 0:
+ # raise ValueError
+ output = detect_redact_pii_for_one_text(txt, analyzer)
+ modified.append(output['modified'])
+ piis.append(output['pii'])
+ if output['pii'] != None: # have PII so output redacted text
+ redacted_content.append(output['redacted'])
+ else: # did not have PII so output original text
+ redacted_content.append(txt)
+ meta_output.append(meta[i])
+ doc_id_output.append(doc_id[i])
+ hash_output.append(hash[i])
+ source_output.append(source[i])
+ bytes_output.append(bytes[i])
+ except:
+ logger.debug('exception occurred!') # seems cannot log from ray actor using this method
+ exceptions.append({
+ 'text':txt,
+ 'doc_id': doc_id[i]
+ })
+ if len(exceptions)>0:
+ if not os.path.exists(exception_dir):
+ os.mkdir(exception_dir)
+ task_id = ray.get_runtime_context().get_task_id()
+ with open(exception_dir + task_id+'.json', 'w') as f:
+ json.dump(exceptions, f)
+
+ return pd.DataFrame({#"original": original_content,
+ 'new_content': redacted_content,
+ 'meta': meta_output,
+ 'doc_id':doc_id_output,
+ 'hash': hash_output,
+ 'source': source_output,
+ 'bytesize':bytes_output,
+ 'secrets': piis,
+ 'modified': modified})
+
+
+ def pii_removal(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+ text = batch['text'].tolist()
+ doc_id = batch['doc_id'] #.to_list()
+ hash = batch['hash']#.to_list()
+ source = batch['source'].tolist()
+ bytes = batch['bytesize'].tolist()
+ meta = batch['meta'].tolist()
+
+ # try:
+ # meta = batch['meta'].tolist()
+ # # print(metas)
+ # except:
+ # meta = [None]*len(contents)
+
+ contents = (text, doc_id, hash, meta, source, bytes)
+
+ return preprocess_fn(contents, analyzer)
+
+
+ if not args.local:
+ dataset = load_dataset(args.input, streaming=True)['train']
+ else:
+ data_dir = args.data_dir
+ if data_dir[-1] != '/':
+ data_dir+='/'
+ datafiles = glob.glob(data_dir + '*.parquet')
+ dataset = load_dataset('parquet', data_files = datafiles, streaming=True)['train']
+
+ if args.skip != None:
+ dataset_to_process = dataset.skip(args.skip)
+ else:
+ dataset_to_process = dataset
+
+ idx = 1
+
+ t0 = time.time()
+ for rows in dataset_to_process.iter(batch_size=args.load_batch_size):
+ logger.info('Start processing batch # {}'.format(idx))
+ print("-----------------------------")
+ df = pd.DataFrame(rows)
+ # logger.info(df['meta'])
+ ray_dataset = ray.data.from_pandas(df)
+ # partition batch into total number of workers
+ ray_dataset = ray_dataset.repartition(parallelism) #, shuffle = True)
+ # processing batch
+ tokenized_data = ray_dataset.map_batches(pii_removal, batch_format="numpy", batch_size=None)
+ # gather data into one file per node
+ tokenized_data = tokenized_data.repartition(num_nodes)
+ tokenized_data.write_parquet(output_dir)
+
+ logger.info('Finished processing batch # {}'.format(idx))
+ logger.info(f"{idx} * {args.load_batch_size} samples were written to disk.")
+ idx += 1
+ print("============================")
+ # if idx == 2:
+ # #sys.exit()
+ # break
+ t1 = time.time()
+ logger.info('Processing {} samples took {:.3f} sec'.format((idx-1)*args.load_batch_size, t1-t0))
+
+
+if __name__ == "__main__":
+ start = time.time()
+ main()
+ end = time.time()
+ print(f"\nthis script took {end-start}s.")
+
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
new file mode 100644
index 000000000..ac156c750
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
@@ -0,0 +1,10 @@
+import json
+import glob
+
+data_dir = "/home/user/local/processed/refinedweb/exceptions/"
+filename = "2af2834dff874b4affffffffffffffffffffffff13000000.json"
+
+with open(data_dir+filename, 'r') as f:
+ data = json.load(f)
+
+print(len(data))
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
new file mode 100644
index 000000000..0fee73d1a
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
@@ -0,0 +1,186 @@
+import ipaddress
+import random
+import string
+
+
+
+# option 1: similar to bigscience-pii redaction:
+# # replace with [TAG], e.g., [EMAIL]
+# #redacted_str, metadata = redact_pii(text, matches)
+# option 2: similar to bigcode-pii redaction:
+# # IP: replace with predefined random IP address, or DNS servers
+# # EMAIL, USERNAME, KEY: replace with random values
+# # also keeping track of pii values through a sample
+# # and replace with the same random value for the same pii value
+# #print(redacted_str)
+# redacted_str = redact_pii_with_random_values(text, matches)
+# # metadata_out = {"regex metadata":metadata, "original": text, "redacted": redacted_str}
+# # match_set = (redacted_str, metadata_out)
+
+
+# The IP replacements are copied from bigcode-pii
+# List of random private IP addresses to use as replacements
+REPLACEMENTS_IP = {
+ "IPv4": ["172.16.31.10", "172.16.58.3", "172.16.17.32", "192.168.127.12", "192.168.3.11"],
+ "IPv6": [
+ "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
+ "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
+ "fc00:e968:6179::de52:7100",
+ "fc00:db20:35b:7399::5",
+ "fdf8:f53e:61e4::18",
+ ],
+}
+
+# providergs = ["google", "cloudfare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"]
+POPULAR_DNS_SERVERS = [
+ "8.8.8.8",
+ "8.8.4.4",
+ "1.1.1.1",
+ "1.0.0.1",
+ "76.76.19.19",
+ "76.223.122.150",
+ "9.9.9.9",
+ "149.112.112.112",
+ "208.67.222.222",
+ "208.67.220.220",
+ "8.26.56.26",
+ "8.20.247.20",
+ "94.140.14.14",
+ "94.140.15.15",
+]
+
+letters = string.ascii_lowercase
+digits = string.digits
+letters_digits = string.ascii_lowercase + string.digits
+
+# random emails
+n = 100
+REPLACEMENT_EMAIL = [
+ "".join(random.choice(letters) for i in range(10)) + "@example.com"
+ for i in range(n)
+ ]
+
+# random keys
+REPLACEMENT_KEY = [
+ "".join(random.choice(digits) for i in range(10))
+ for i in range(n)
+ ]
+# simple hack: make key replacement and phone replacement to be
+# both 10 random digits
+# to simplify redaction
+# [
+# "".join(random.choice(lettters_digits) for i in range(32)) for i in range(n)
+# ]
+
+# random usernames
+REPLACEMENT_USERNAME = [
+ "@"+"".join(random.choice(letters) for i in range(10))
+ for i in range(n)
+ ]
+
+REPLACEMENT_PHONE = [
+ "".join(random.choice(digits) for i in range(10))
+ for i in range(n)
+ ]
+
+REPLACEMENT_DICT={
+ 'EMAIL': REPLACEMENT_EMAIL,
+ 'KEY': REPLACEMENT_KEY,
+ 'USER': REPLACEMENT_USERNAME,
+ 'PHONE_NUMBER':REPLACEMENT_PHONE
+}
+
+def is_private_ip(ip):
+ """Check if an IP address is allocated for private networks"""
+ ip = ipaddress.ip_address(ip)
+ return ip.is_private
+
+def replace_ip(value):
+ """Replace an IP address with a synthetic IP address of the same format"""
+    # ipaddress.ip_address(ip) raises an exception when ip is not valid
+ # if is_private_ip(value) or (value in POPULAR_DNS_SERVERS):
+ # return value
+
+ if value in POPULAR_DNS_SERVERS:
+ #print('IP is one of DNS servers, return original value: ', value)
+ return value
+
+ try:
+ ipaddress.IPv4Address(value)
+ #print('IP is IPv4, return redacted value')
+ return random.choice(REPLACEMENTS_IP["IPv4"])
+ except ValueError:
+ try:
+ ipaddress.IPv6Address(value)
+ #print('IP is IPv6, return redacted value')
+ return random.choice(REPLACEMENTS_IP["IPv6"])
+ except ValueError:
+ # this doesn't happen if we already use ipaddress filter in the detection
+            # this is good as we have another layer of protection against false positives
+ #print("Invalid IP address:", value)
+ return value
+
+def redact_email_key_user_phone(value, tag):
+ supported_tags = {'KEY', 'EMAIL', 'USER', 'PHONE_NUMBER'}
+ if tag in supported_tags:
+ #return random.choice(REPLACEMENT_DICT[tag])
+ if tag=='KEY':
+ redact_value = "".join(random.choice(digits) for i in range(10))
+ if tag == 'EMAIL':
+ redact_value = "".join(random.choice(letters) for i in range(10)) + "@{}.com".format("".join(random.choice(letters) for i in range(5)))
+ if tag == 'USER':
+ redact_value = "@"+"".join(random.choice(letters) for i in range(10))
+ if tag == 'PHONE_NUMBER':
+ redact_value = "".join(random.choice(digits) for i in range(10))
+ return redact_value
+ else:
+ #print('{} type is not supported!'.format(tag))
+ return value
+
+
+# TODO: generate random strings on the fly, instead of choose from one of n
+def redact_pii_with_random_values(text, matches):
+ # adapted from bigcode-pii redaction
+ # however, matches here is a list of dictionaries
+ # the dictionary is of this schema:
+ # {'start': 123, 'end': 234, 'value': xyz, 'type': PHONE_NUMBER}
+ redacted_str = text
+ replaced_values = []
+ lookup_dict = {}
+ for match in matches:
+ start_idx = match['start']
+ end_idx = match['end']
+ matched_str = match['value'] #text[start_idx:end_idx]
+ tag = match['type']
+ if matched_str in replaced_values:
+ redact_tag = lookup_dict[matched_str]
+ else:
+ if tag == 'IP_ADDRESS':
+ redact_tag = replace_ip(matched_str)
+
+ else:
+ redact_tag = redact_email_key_user_phone(matched_str, tag)
+
+ replaced_values.append(matched_str)
+ lookup_dict[matched_str]=redact_tag
+
+ # print('original: ', matched_str)
+ # print('redacted tag: ', redact_tag)
+ match['redacted'] = redact_tag
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+ # Create the "metadata" as all of the information we had before redaction
+ #metadata += [(match)]
+ #print(matches)
+ return redacted_str
+
+
+def redact_pii_with_tags(text, matches):
+ # adapted from bigscience-pii
+ redacted_str = text
+ for match in matches:
+ matched_str = match['value']
+ tag = match['type']
+ redact_tag = "[" + tag +"]"
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+
+ return redacted_str
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
new file mode 100644
index 000000000..5e1d4bdd9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
@@ -0,0 +1,162 @@
+import json
+from datasets import load_dataset
+import random
+import phonenumbers  # used by is_phone_number below
+from bigscience_pii_detect_redact import detect_pii
+
+piis_not_to_consider =['PERSON',
+ 'NRP',
+ 'LOCATION',
+ 'DATE_TIME',
+ 'URL'
+ ]
+
+def write_to_html(result, text_corpus):
+ idx = result['doc_idx']
+ text = text_corpus[idx]['text']
+ text_list = list(text)
+
+ entity_categories = []
+ n = 0
+ for r in result['pii']:
+ start = int(r['start'])
+ end = int(r['end'])
+        # wrap the PII span in a highlight tag and append its category
+        text_list.insert(start+3*n, '<mark>')
+        text_list.insert(end+1+3*n, "</mark>")
+        text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+ n+=1
+ entity_categories.append(r['type'])
+
+ bolded = ''.join(text_list)
+ #html = ""+bolded+""
+ #print(html)
+
+ summary = summarize_pii_entities(entity_categories)
+
+    bolded = summary + '<br>' + bolded
+ return bolded, entity_categories
+
+
+def summarize_pii_entities(entity_categories):
+ unique_categories = list(set(entity_categories))
+ summary = 'PIIs: '
+ for e in unique_categories:
+ occurences = entity_categories.count(e)
+ summary += (e + ": "+str(occurences)+'; ')
+ return summary
+
+def parse_recognizer_result(result):
+ #temp = result.split(',')
+ #assert len(temp)==4, 'a valid result should have 4 fields, but only got {} fields'.format(len(temp))
+ parsed_dict = {}
+
+ parsed_dict['type']=result.entity_type
+ parsed_dict['start']=result.start
+ parsed_dict['end']=result.end #temp[2][5:]
+ #parsed_dict['score']=result.score #temp[3][6:]
+
+ return parsed_dict
+
+
+def count_num_piis_to_consider(result):
+ #result is a dictionary of this format
+ # {doc_idx:123, num_pii: 2, pii: [{type: ABC, start:234, end: 256, score:0.6}, {}]}
+ filtered_piis = []
+ piis = result['pii']
+ num_piis_to_consider = 0
+ for pii in piis:
+ if pii['type'] in piis_not_to_consider:
+ #print('Not including {} category'.format(pii['type']))
+ continue
+ else:
+ num_piis_to_consider += 1
+ #print(pii)
+ if pii['type'] != 'IP_ADDRESS' and pii['type'] !='EMAIL_ADDRESS':
+ pii['type'] = 'ID_NUM_STR'
+ #print(pii)
+ filtered_piis.append(pii)
+ #print('number of piis to consider: ', num_piis_to_consider)
+ #print('filtered piis: ',filtered_piis)
+
+ return num_piis_to_consider, filtered_piis
+
+def filter_results_by_category(results):
+ filtered_results = []
+ for result in results:
+ num_piis_to_consider, filtered_piis = count_num_piis_to_consider(result)
+ if num_piis_to_consider>0:
+ result['pii'] = filtered_piis
+ result['num_pii']=len(filtered_piis)
+ filtered_results.append(result)
+ #print('filtered results: ',filtered_results)
+ return filtered_results
+
+def sample_results(results, number_of_samples):
+ random.seed(1234)
+ if len(results) > number_of_samples:
+ return random.sample(results, number_of_samples)
+ else:
+ return results
+
+# this tag list is copied from
+# https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py#L53
+high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS'} # , 'NUMBER', "ID"}
+
+def detect_with_bigscience_pii_single_sample(text):
+ matches = detect_pii(text, None, high_risk_tags)
+ if len(matches)>0:
+ pii_list = []
+ for m in matches:
+ #print(m)
+ pii = {}
+ pii['type']=m[-2]
+ pii['start']=m[1][0]
+ pii['end']=m[1][1]
+ #print(pii)
+ pii_list.append(pii)
+
+ return pii_list
+ else:
+ return None
+
+def is_phone_number(matched_str):
+ DEFAULT_SUPPORTED_REGIONS = ("US", "UK", "DE", "FE", "IL", "IN", "CA", "BR")
+ #valid = phonenumbers.is_valid_number(matched_str)
+ #print(matched_str)
+ for region in DEFAULT_SUPPORTED_REGIONS:
+ try:
+            # pass the region so numbers without a country code can be parsed
+            parsed_number = phonenumbers.parse(matched_str, region)
+        except:
+            #print('cannot parse the string as phone number')
+            continue
+
+ flag = phonenumbers.is_possible_number(parsed_number)
+ if flag == True:
+ #print('KEY is PHONE_NUMBER')
+ return True
+
+ return False
+
+
+
+def remove_phone_numbers_from_bigscience_results(matches):
+ # use bigscience-pii to detect
+ # emails, ip addresses, usernames, id alphanumerics
+ if len(matches)>0:
+ pii_list = []
+ phone_matches = []
+ for i, m in enumerate(matches):
+ matched_str = m[0]
+ if is_phone_number(matched_str):
+ phone_matches.append(i)
+ # else:
+ # # print(m)
+ # pii = {}
+ # pii['type']=m[-2]
+ # pii['start']=m[1][0]
+ # pii['end']=m[1][1]
+ # print(pii)
+ # pii_list.append(pii)
+
+
+ matches = [matches[i] for i in range(len(matches)) if i not in phone_matches]
+ return matches
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
new file mode 100644
index 000000000..c7b32203f
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
@@ -0,0 +1,158 @@
+from datasets import load_dataset
+# from presidio_analyzer import AnalyzerEngine
+from utils import summarize_pii_entities
+# import time
+import json
+# from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+# from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+import glob
+import random
+import os
+import pandas as pd
+# from matplotlib import pyplot as plt
+
+def write_merged_detect_redact_results_to_html(sample):
+ # sample is a dictionary {'text': xxxx, pii: []}
+ #text = sample['original']
+ piis = sample['secrets']
+ entity_categories = []
+
+ if sample['modified']:
+ #text_list = list(text)
+ redacted_text = sample['text']
+ redacted_text_list = list(redacted_text)
+
+
+ n = 0
+ for r in piis:
+ start = int(r['start'])
+ end = int(r['end'])
+            # text_list.insert(start+3*n, '<mark>')
+            # text_list.insert(end+1+3*n, "</mark>")
+            # text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+
+            # wrap the PII span in a highlight tag and append its category
+            redacted_text_list.insert(start+3*n, '<mark>')
+            redacted_text_list.insert(end+1+3*n, "</mark>")
+            redacted_text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+
+ n+=1
+ entity_categories.append(r['type'])
+
+
+ bolded = ''.join(redacted_text_list)
+ #html = ""+bolded+""
+ #print(html)
+
+ # redacted_marked = ''.join(redacted_text_list)
+ summary = summarize_pii_entities(entity_categories)
+        bolded = summary + '<br>' + bolded
+
+ else:
+ bolded = sample['text']
+ # redacted_marked = None
+
+ return bolded, entity_categories
+
+
+
+
+path = '/home/vmagent/app/falcon-refinedweb-pii-remove/'
+datafile = glob.glob(path + '*.parquet')
+# randomly pick one file from output
+filename = random.choice(datafile)
+output = 'pii_test'
+
+# Check 1: load with pd to check schema and content
+df = pd.read_parquet(filename)
+print(df.head(10))
+
+print(df.shape)
+
+
+# Check 2: get statistics from a sample
+def get_stats(row):
+    count_dict = {
+        'PHONE_NUMBER': 0,
+        'IP_ADDRESS': 0,
+        'EMAIL': 0,
+        'USER': 0,
+        'KEY': 0
+    }
+
+    if row['modified']:
+        pii = row['secrets']
+        num_piis = len(pii)
+        for x in pii:
+            count_dict[x['type']] += 1
+    else:
+        num_piis = 0
+
+    return num_piis, count_dict
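+# Example (illustrative): a modified row with two emails and one key yields
+#   (3, {'PHONE_NUMBER': 0, 'IP_ADDRESS': 0, 'EMAIL': 2, 'USER': 0, 'KEY': 1})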
+
+
+sample_files = random.sample(datafile, min(10, len(datafile)))
+total_num_piis = 0
+count_dict_all = {
+    'PHONE_NUMBER': 0,
+    'IP_ADDRESS': 0,
+    'EMAIL': 0,
+    'USER': 0,
+    'KEY': 0
+}
+
+for f in sample_files:
+    df = pd.read_parquet(f)
+    # sample at most 1000 rows so small files do not raise in .sample()
+    df = df.sample(min(1000, len(df)))
+    for _, row in df.iterrows():
+        num_piis, count_dict = get_stats(row)
+        total_num_piis += num_piis
+        for k, v in count_dict.items():
+            count_dict_all[k] += v
+
+print(count_dict_all)
+
+
+# Check 3: visual check with html
+df = df.sample(min(100, len(df)))
+html = ""
+num_piis = []
+entities = []
+summary = ""
+
+for _, sample in df.iterrows():
+    bolded, entity_categories = write_merged_detect_redact_results_to_html(sample)
+
+    try:
+        meta = sample['meta']
+        html += '<p>' + meta + '<br>' + bolded + '</p>'
+    except (KeyError, TypeError):
+        html += '---------------------------'
+        html += '<p>' + bolded + '</p>'
+
+    if sample['modified']:
+        num_piis.append(len(sample['secrets']))
+        entities.extend(entity_categories)
+
+assert sum(num_piis) == len(entities), 'number of entities does not match'
+
+summary += 'Total number of PIIs: {}'.format(len(entities))
+summary += '<br>' + summarize_pii_entities(entities) + '<br>'
+
+html = '<html><body>' + summary + html + '</body></html>'
+
+output_path = path + 'validation/'
+os.makedirs(output_path, exist_ok=True)
+
+output_file = output_path + '{}-pii-validation.html'.format(output)
+with open(output_file, "w") as f:
+    f.write(html)
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_redaction.py b/tools/pii_removal_for_contact_mp/pii_redaction.py
new file mode 100644
index 000000000..bcac8a0f9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_redaction.py
@@ -0,0 +1,23 @@
+import argparse
+from pyrecdp.core.utils import Timer
+from pii_redaction_impl import pii_remove_MP
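+# Entry point for multi-process PII removal over a folder of parquet files.
+# Example invocation (paths are illustrative):
+#   python pii_redaction.py -d <input_dir> -o <output_dir> -mp <num_processes>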
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", dest="data_dir", type=str, help="input folder of parquet files")
+    parser.add_argument("-o", dest="out_dir", type=str, help="output folder")
+    parser.add_argument("-mp", dest="mp", type=int, default=-1, help="number of parallel processes, -1 to auto-detect")
+    args = parser.parse_args()
+
+    data_dir = args.data_dir
+    out_dir = args.out_dir
+    n_parallel = args.mp
+
+    with Timer(f"pii removal for {data_dir}"):
+        pii_remove_MP(data_dir, out_dir, n_parallel)
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_redaction_impl.py b/tools/pii_removal_for_contact_mp/pii_redaction_impl.py
new file mode 100644
index 000000000..9589e7833
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_redaction_impl.py
@@ -0,0 +1,142 @@
+import argparse
+import os, sys
+from pyrecdp.core.utils import Timer
+from pyrecdp.primitives.llmutils.utils import get_nchunks_and_nproc
+import ast
+import pandas as pd
+from tqdm import tqdm
+import subprocess #nosec
+import time
+
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+import pathlib
+cur_path = str(pathlib.Path(__file__).parent.resolve())
+import_path = os.path.join(cur_path, "pii_detection_redaction", "src")
+print(f"add new import_path: {import_path}")
+sys.path.append(import_path)
+
+from pii_redaction_v2 import *
+
+def pii_removal_impl_parquet_to_parquet(in_file_name, out_file_name, base_file_name):
+    # PhoneRecognizer handles phone numbers; the remaining PII types are
+    # handled inside detect_redact_pii_for_one_text from pii_redaction_v2
+    analyzer = PhoneRecognizer()
+    batch = pd.read_parquet(in_file_name).reset_index(drop=True)
+    text = batch['text'].tolist()
+    redacted_content = []
+    modified = []
+    piis = []
+
+    for txt in text:
+        output = detect_redact_pii_for_one_text(txt, analyzer)
+        modified.append(output['modified'])
+        piis.append(output['pii'])
+        if output['pii'] is not None:  # has PII, output the redacted text
+            redacted_content.append(output['redacted'])
+        else:  # no PII, keep the original text
+            redacted_content.append(txt)
+
+    batch['text'] = pd.Series(redacted_content)
+    batch['secrets'] = pd.Series(piis)
+    batch['modified'] = pd.Series(modified)
+
+    batch.to_parquet(out_file_name)
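+# The output parquet keeps the input schema and adds two columns:
+#   'secrets'  - list of detected PIIs (None when nothing was found)
+#   'modified' - True when 'text' was rewritten with redacted values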
+
+# define actual work
+def pii_remove(proc_id, x_list, out_type):
+    for x in x_list:
+        try:
+            in_file_name, out_file_name, base_file_name = x
+            base_file_name = os.path.basename(base_file_name)
+            out_dir = os.path.dirname(out_file_name)
+            os.makedirs(out_dir, exist_ok=True)
+            pii_removal_impl_parquet_to_parquet(in_file_name, out_file_name, base_file_name)
+        except Exception as e:
+            # leave a per-file error log next to the expected output
+            with open(f"{out_file_name}.error.log", 'w') as f:
+                f.write(f"Failed to process {base_file_name}, error is {e}")
+    return True
+
+def wait_and_check(pool):
+    for proc_id, (process, cmd) in pool.items():
+        std_out, std_err = process.communicate()
+        rc = process.returncode
+        if rc != 0:
+            file_name = f"pii-redaction-proc-{proc_id}.error.log"
+            print(f"Task failed, please check {file_name} for detailed information")
+            with open(file_name, "a") as f:
+                f.write(f"=== {time.ctime()} {' '.join(cmd)} failed. ===\n")
+                f.write(std_err.decode(sys.getfilesystemencoding()))
+                f.write("\n")
+
+def launch_cmdline_mp(args, data_dir, out_dir, mp):
+    pool = {}
+    for arg in tqdm(args, total=len(args), desc="pii redaction"):
+        proc_id, x_list = arg
+        cmd = ["python", "pii_redaction_impl.py", "--proc_id", f"{proc_id}", "--in_dir", f"{data_dir}", "--out_dir", f"{out_dir}", "--file_list", f"{x_list}"]
+        pool[proc_id] = (subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE), cmd)
+
+        # wait for the whole batch to finish before launching the next one
+        if len(pool) >= mp:
+            wait_and_check(pool)
+            pool = {}
+
+    wait_and_check(pool)
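+# Parallelism here is process-per-file via subprocess rather than a
+# multiprocessing.Pool: each worker re-invokes this script with --proc_id and
+# --file_list, so a crash in one file cannot take down the whole run.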
+
+def get_target_file_list(data_dir, file_type):
+    cmd = ["find", data_dir, "-name", f"*.{file_type}"]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode != 0:
+        return []
+    # return paths relative to data_dir
+    ret = stdout.decode("utf-8").split('\n')[:-1]
+    ret = [i.replace(data_dir, "") for i in ret]
+    ret = [i[1:] if i[0] == '/' else i for i in ret]
+    return ret
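+# Example (illustrative): with data_dir='/data' holding /data/a/1.parquet and
+# /data/2.parquet, get_target_file_list('/data', 'parquet') returns
+# ['a/1.parquet', '2.parquet'].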
+
+def pii_remove_MP(data_dir, out_dir, n_part = -1):
+    files = get_target_file_list(data_dir, 'parquet')
+
+    if len(files) == 0:
+        print("Detected 0 files, exiting")
+        return
+
+    if n_part != -1:
+        n_proc = n_part
+    else:
+        _, n_proc = get_nchunks_and_nproc(len(files), n_part = n_part)
+        print(f"resetting number of processes to {n_proc}")
+
+    # one task per file; launch_cmdline_mp caps concurrency at n_proc
+    args = [(idx, [i]) for idx, i in enumerate(files)]
+    launch_cmdline_mp(args, data_dir, out_dir, n_proc)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--proc_id", dest="proc_id", type=int)
+    parser.add_argument("--in_dir", dest="in_dir", type=str)
+    parser.add_argument("--out_dir", dest="out_dir", type=str)
+    parser.add_argument("--file_list", dest="file_list", type=str)
+    args = parser.parse_args()
+
+    proc_id = args.proc_id
+    in_dir = args.in_dir
+    out_dir = args.out_dir
+    # the parent passes the file list as a Python list literal; parse it safely
+    in_file_list = ast.literal_eval(args.file_list)
+    out_type = 'parquet'
+
+    file_args = [(os.path.join(in_dir, f_name), os.path.join(out_dir, f"{f_name}.pii_remove.{out_type}"), f_name) for f_name in in_file_list]
+
+    with Timer(f"pii removal with proc-id {proc_id}"):
+        pii_remove(proc_id, file_args, out_type)