From f7a312662ae9cf9f902e3730800ac6e9cf1a641f Mon Sep 17 00:00:00 2001
From: "Chendi.Xue"
Date: Wed, 20 Sep 2023 04:38:11 -0500
Subject: [PATCH] PII removal with multi-process support based on minmin's
 code (#62)
* commit pii removal mp
Signed-off-by: Xue, Chendi
* Update README.md
* rename folder
Signed-off-by: Xue, Chendi
* fix
Signed-off-by: Xue, Chendi
* Update README.md
---------
Signed-off-by: Xue, Chendi
---
tools/pii_removal_for_contact_mp/README.md | 27 ++
.../pii_detection_redaction/README.md | 34 ++
.../scripts/pii-pile-hn.sh | 14 +
.../scripts/pii-redpj.sh | 17 +
.../scripts/pii-refinedweb.sh | 14 +
.../scripts/pii-slimpj.sh | 16 +
.../src/bigscience_pii_detect_redact.py | 325 ++++++++++++++++++
.../pii_detection_redaction/src/detect_pii.py | 80 +++++
.../src/pii_redaction.py | 320 +++++++++++++++++
.../src/pii_redaction_v2.py | 294 ++++++++++++++++
.../src/process_exceptions.py | 10 +
.../pii_detection_redaction/src/redact_pii.py | 186 ++++++++++
.../pii_detection_redaction/src/utils.py | 162 +++++++++
.../src/validate_ray_outputs.py | 158 +++++++++
.../pii_redaction.py | 23 ++
.../pii_redaction_impl.py | 142 ++++++++
16 files changed, 1822 insertions(+)
create mode 100644 tools/pii_removal_for_contact_mp/README.md
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_redaction.py
create mode 100644 tools/pii_removal_for_contact_mp/pii_redaction_impl.py
diff --git a/tools/pii_removal_for_contact_mp/README.md b/tools/pii_removal_for_contact_mp/README.md
new file mode 100644
index 000000000..1b8d8fd95
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/README.md
@@ -0,0 +1,27 @@
+# PII removal for contact info
+
+## Intro
+
+PII removal for contact info replaces personal information such as email addresses and phone numbers with random nonsense strings to protect personal information.
+This script uses multiprocessing to speed up PII removal.
+
+## Expected input and Output
+
+Input format: a folder of *parquet files; a 'text' column is required in the parquet schema.
+
+Output format: a folder of *parquet files; the 'text' column is processed and personal info is replaced.
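+
+A quick way to sanity-check one input file (a sketch, assuming pandas with a parquet engine is installed; the path is a placeholder):
+```python
+import pandas as pd
+
+# The tool expects every input parquet file to expose a 'text' column.
+df = pd.read_parquet("path/to/one_file.parquet")  # hypothetical path
+assert "text" in df.columns
+```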
+
+## How to RUN
+```
+conda create --name pyrecdp
+conda activate pyrecdp
+pip install pyrecdp --pre
+pip install presidio_analyzer
+python -m spacy download en_core_web_lg
+python pii_redaction.py -d ../falcon-refinedweb -o ../falcon-refinedweb-pii_removal -mp 224
+```
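+
+Here `-d` is the input folder, `-o` is the output folder, and `-mp` is the number of worker processes (default -1).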
+
+## NOTICE
+
+We run file-wise parallelism; a 300MB file usually takes around 15-20 min to complete, so the progress bar will advance slowly.
+One way to confirm the job is active is to run 'top' and check whether multiple python processes are busy. A scripted version of this check is sketched below.
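+
+A minimal scripted check (a sketch; assumes a POSIX system with pgrep, and that the worker command lines contain 'pii_redaction'):
+```python
+# Count the python processes whose command line mentions the redaction script.
+import subprocess
+out = subprocess.run(["pgrep", "-c", "-f", "pii_redaction"], capture_output=True, text=True)
+print(out.stdout.strip() or "0")
+```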
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md b/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
new file mode 100644
index 000000000..dae0a7658
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/README.md
@@ -0,0 +1,34 @@
+# How to run PII-for-text pipeline
+
+## Overview
+The pipeline detects 5 different types of PIIs: 'PHONE_NUMBER', 'IP_ADDRESS', 'EMAIL', 'USER', 'KEY'. The detection is based on regular expressions using open-source packages including presidio and bigscience-pii. The detection precision/recall has been tuned for web-scrape-based datasets, for example, Falcon-RefinedWeb, SlimPajama-StackExchange, PILE-Hackernews. But the detection precision/recall is not good for code data like GitHub.
+
+Two redaction methods have been implemented:
+1. Replacement with random values
+2. Replacement with tags such as [PHONE_NUMBER], [EMAIL], etc.
+Currently, option 1 is used. A short usage sketch of both helpers is shown below.
+
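+For illustration, a minimal sketch of calling the two redaction helpers in `src/redact_pii.py` directly (run from the src folder; the sample text and span are made up):
+```python
+# Both helpers take the original text plus a list of
+# {'start', 'end', 'value', 'type'} dicts as produced by the detectors.
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+
+text = "Contact me at jane@example.com"
+piis = [{'start': 14, 'end': 30, 'value': 'jane@example.com', 'type': 'EMAIL'}]
+
+print(redact_pii_with_tags(text, piis))           # Contact me at [EMAIL]
+print(redact_pii_with_random_values(text, piis))  # e.g. Contact me at kqzrwbetha@mxpqo.com
+```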
+
+## How to run
+### Step 1: Set up Env
+Please follow [this guide](../workload_in_containers/README.md) on how to set up the container environment for this workload. Once the containers are running, you can enter the container on the head node using the following command:
+```bash
+docker exec -it ray-leader bash
+```
+
+### Step 2: Run PII removal
+Once you are inside the ray-leader container, go to the scripts folder. You can change `BATCHSIZE` and `CPUCORES` depending on the memory and number of cores on your system. Then run the PII script, for example:
+```
+bash pii-refinedweb.sh
+```
+
+### Step 3: Validate outputs
+We implemented 3 checks:
+1. Check the schema and sample rows in the output parquets by loading them with pandas
+2. Count the number of PIIs per category by sampling from the outputs. You can further estimate the total number of PIIs per category by multiplying the sampled counts by total_num_samples/samples_used_for_this_check (e.g., 40 emails in a 1,000-row sample of a 1,000,000-row dataset suggests roughly 40,000 emails in total)
+3. Visually check a small sample by producing an HTML file with yellow highlights on the PIIs, annotated with the corresponding category (note that sometimes the highlights are not at the exact location, but should be quite close)
+
+```
+# First change the path to the data files in the python script
+python src/validate_ray_outputs.py
+```
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
new file mode 100644
index 000000000..cc3d168b5
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-pile-hn.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+BATCHSIZE=1000
+CPUCORES=48
+DATA=pile_hn
+OUTPUT_PREFIX=pile_hn
+DATA_DIR=/home/user/local/PILE/hn
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
new file mode 100644
index 000000000..aea0eda67
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-redpj.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+BATCHSIZE=50000
+CPUCORES=48
+INPUT=togethercomputer/RedPajama-Data-1T-Sample
+DATA=slimpajama
+OUTPUT_PREFIX=redpajama
+DATA_DIR=/home/user/local/dataset/RedPajama-Data-1T-Sample/
+
+python ../src/pii_redaction.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--input $INPUT \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
+#--skip 500000
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
new file mode 100644
index 000000000..2b3a4b858
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-refinedweb.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+BATCHSIZE=1000
+CPUCORES=48
+DATA=refinedweb
+OUTPUT_PREFIX=pii_test_output
+DATA_DIR=/home/user/local/refinedweb_samples
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
new file mode 100644
index 000000000..3601b582f
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/scripts/pii-slimpj.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+BATCHSIZE=50000
+CPUCORES=48
+INPUT=cerebras/SlimPajama-627B
+DATA=slimpajama
+OUTPUT_PREFIX=pii_slimpajama_se
+DATA_DIR=/home/user/local/
+
+python ../src/pii_redaction_v2.py \
+--load-batch-size $BATCHSIZE \
+--cpu-per-worker $CPUCORES \
+--input $INPUT \
+--dataset-family $DATA \
+--output-prefix $OUTPUT_PREFIX \
+--data-dir $DATA_DIR \
+--local
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
new file mode 100644
index 000000000..904477dcc
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/bigscience_pii_detect_redact.py
@@ -0,0 +1,325 @@
+# -*- coding: utf-8 -*-
+"""MST BigScience PII Code
+
+Original colab that is a source of this file is located at
+ https://colab.research.google.com/drive/1086H3-LGMz3gX0pGy9ECgr8KflosSKso
+
+# License
+
+Copyright 2022 Authors of this Notebook
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+# What is this colab?
+
+This colab detects the following kinds of PII for all languages in BigScience.
+Languages assumed are ["ar", "as", "bn", "ca", "en", "es", "eu", "fr", "gu", "hi", "id", "ig", "mr", "ny", "pa", "pt", "sn", "st", "sw", "ur", "vi", "xh", "yo", "zh", "zu"]
+
+## Highest Risk
+### Simple spans of characters:
+* **IDs [general]:** This is anything that is a sequence of 6 or more digits, as is common in identifiers for people internationally (national IDs, tax IDs, passport numbers, etc.), credit card numbers, IBAN codes, etc.
+* **Key [general]**: This is anything that is a sequence of digits and letters in the same string, optionally with spaces. Common for Credit Card and API, SSH, GPG keys. (Privacy group doesn't have a regex for this)
+* **Email address**, **User name**: Strings using @
+* **IP address**: Digits with periods in them
+* **Phone number**: At least 7 digits with spaces in them
+* **License plate**: (Privacy group doesn't have cross-lingual handling for this, MST group doesn't have a regex for this)
+
+### More complex spans: (WORK IN PROGRESS)
+* **Full Names**: Requires additional NER package
+* **Address**
+
+
+## Lower Risk: (We're not doing)
+* **URL**
+* **Time**: dateparser dependency
+* **Date**: dateparser dependency
+* **Age**
+
+"""
+
+
+#@title Define highest risk PII. TODO: License plate
+# NUMBER removed last minute due to false positives. See https://huggingface.slack.com/archives/C0307KE5UNT/p1647011702716159
+high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS', 'IPv4', 'IPv6'} # , 'NUMBER', "ID"}
+
+"""# Regexes"""
+
+#@title Get the less sophisticated MST regexes for High Risk scenarios (baseline comparison). Not language-specific; all are general.
+import sys
+import regex
+import ipaddress
+# These are ordered so that we can return upon a match; no need to search for a substring.
+year_patterns = [
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/][1-2][0-9]{3})(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-yyyy or yyyy/yyyy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}[\p{Pd}/.][0-3][0-9][\p{Pd}/.][0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-mm-dd or yyyy-dd-mm or yyyy/mm/dd or yyyy/dd/mm or yyyy.mm.dd or yyyy.dd.mm
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/.][0-3][0-9][\p{Pd}/.](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # mm-dd-yyyy or dd-mm-yyyy or mm/dd/yyyy or dd/mm/yyyy or mm.dd.yyyy or dd.mm.yyyy or the same but with yy instead of yyyy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([0-3][0-9][\p{Pd}/](?:[0-9]{2}|[1-2][0-9]{3}))(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # mm-yyyy or mm/yyyy or the same but with yy
+ regex.compile(r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([1-2][0-9]{3}-[0-3][0-9])(?:$|[\s@,?!;:\'\"(.\p{Han}])"), # yyyy-mm or yyyy/mm
+]
+
+# Patterns for high-risk character strings
+id_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])([A-Za-z]*(?:[\p{Pd}]*\p{Nd}){6,})(?:$|[\b\s@?,!;:\'\")(.\p{Han}])'
+# https://regex101.com/r/JQkmh8/2
+# key_pattern = r'(?:^|[\b\s@?,!;:\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[\s\p{Pd}]?){4,})(?:$|[\b\s\p{Han}@?,!;:\'\"])'
+# https://regex101.com/r/JQkmh8/5
+key_pattern = r'(?:^|[\b\s@?,!:;\'\")(.\p{Han}])((?:(?:[A-Za-z]+[\p{Nd}\p{Pd}\/\+\=:_]+|[\p{Nd}\p{Pd}\/\+\=:]+[A-Za-z]+)){4,}|(?:(?:\p{Nd}{3,}|[A-Z]+\p{Nd}+[A-Z]*|\p{Nd}+[A-Z]+\p{Nd}*)[ \p{Pd}]?){3,})(?:$|[\b\s\p{Han}@?,!;:\'\")(.])'
+ipv4_pattern = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}'
+ipv6_pattern = r'(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])'
+
+# presidio
+# #ipv4_pattern = r"\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
+# this one below gives a lot of false positives ::
+#ipv6_pattern = r"\b(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\b"
+ip_pattern = r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])(" + r"|".join([ipv4_pattern, ipv6_pattern]) + ")(?:$|[\s@,?!;:\'\"(.\p{Han}])"
+
+
+# bigcode-pii
+ipv4_pattern = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
+ipv6_pattern = r"(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
+ip_pattern = (
+ r"(?:^|[\b\s@?,!;:\'\")(.\p{Han}])("
+ + r"|".join([ipv4_pattern, ipv6_pattern])
+ + ")(?:$|[\s@,?!;:'\"(.\p{Han}])"
+)
+
+# https://regex101.com/r/EpA5B7/1
+email_pattern = r'''
+ (?<= ^ | [\b\s@,?!;:)('".\p{Han}<] )
+ (
+ [^\b\s@?!;,:)('"<]+
+ @
+ [^\b\s@!?;,/]*
+ [^\b\s@?!;,/:)('">.]
+ \.
+ \p{L} \w{1,}
+ )
+ (?= $ | [\b\s@,?!;:)('".\p{Han}>] )
+'''
+
+# https://regex101.com/r/mOqi1s/3
+#user_pattern = r'(?:^|[\s@,?!;:\'\")(\p{Han}])(@[^\s@,?!;:\'\")(]{3,})'
+user_pattern = r'''
+ (?<= ^ | [)(\s@,?!;:'"\p{Han}] )
+ (@
+ [^)(\s@,?!;:'"]{3,}
+ )
+'''
+# Examples from https://regexpattern.com/phone-number/
+# https://regex101.com/r/lZZ0XP/4
+# Also matches MLS numbers
+# phone_pattern = r'(?:^|[\s\'\"(\p{Han}])((?:\+\p{Nd}+[ \/.\p{Pd}]*)?(?:(?:\(\+?\p{Nd}+\))?(?:[ \/.\p{Pd}]*\p{Nd})){7,}(?:[\t\f #]*\p{Nd}+)?)(?:$|[\s@,?!;:\'\"(.\p{Han}])'
+
+id_regex = regex.compile(id_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+key_regex = regex.compile(key_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+ipv4_regex = regex.compile(ipv4_pattern)
+ipv6_regex = regex.compile(ipv6_pattern)
+ip_regex = regex.compile(ip_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+email_regex = regex.compile(email_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE)
+user_regex = regex.compile(user_pattern, flags=regex.MULTILINE|regex.VERBOSE) #, re.MULTILINE)
+# phone_regex = regex.compile(phone_pattern, flags=regex.MULTILINE) #, re.MULTILINE)
+# TODO: license
+
+
+#sasha_regexes = copy.deepcopy(regex_rulebase)
+mst_regexes = {}
+for tag in high_risk_tags:
+ #print(tag)
+ if tag == 'ID':
+ mst_regexes['ID'] = id_regex
+ elif tag == 'KEY':
+ mst_regexes['KEY'] = key_regex
+ elif tag == 'IPv4':
+ mst_regexes['IPv4'] = ipv4_regex
+ elif tag == 'IPv6':
+ mst_regexes['IPv6'] = ipv6_regex
+ elif tag == 'IP_ADDRESS':
+ mst_regexes['IP_ADDRESS'] = ip_regex
+ elif tag == 'EMAIL':
+ mst_regexes['EMAIL'] = email_regex
+ elif tag == 'USER':
+ mst_regexes['USER'] = user_regex
+# elif tag == 'NUMBER':
+# mst_regexes['NUMBER'] = phone_regex
+ else:
+ sys.stderr.write('Dont have tag regex pattern for %s =(' % tag)
+
+#print("MST regexes under examination are:")
+#for tag, regx in mst_regexes.items():
+ #print(tag, end=":\t")
+ #print(regx)
+
+"""# PI Detection and Redaction functions are defined here! """
+
+#@title The detection functions and basic filtering functions are defined here.
+# tag_type = {'ID', 'KEY', 'EMAIL', 'IP_ADDRESS', 'PHONE', 'LICENSE_PLATE'}
+# Choose whether to put this import before or after, depending on which you're testing. =)
+
+def ip_has_digit(matched_str):
+ """Checks to make sure the PII span is not just :: or whatever that may
+ accidentally be picked up by making sure there are digits."""
+ return any(map(str.isdigit, matched_str))
+
+# from bigcode-pii
+def filter_versions(matched_str, context):
+ """Filter addresses in this format x.x.x.x, x.xx.x.x and the words dns/server
+ don't appear in the neighboring context, usually they are just versions"""
+ # count occurrence of dots
+ dot_count = matched_str.count('.')
+ exclude = (dot_count <= 3 and len(matched_str) <= 8)
+ if exclude:
+ if "dns" in context.lower() or "server" in context.lower():
+ return False
+ return exclude
+
+# from bigcode-pii
+def not_ip_address(matched_str):
+ """ make sure the string has a valid IP address format
+ e.g: 33.01.33.33 is not a valid IP address because of the 0 in front of 1
+ TODO: fix this directly in the regex"""
+ try:
+ ipaddress.ip_address(matched_str)
+ return False
+ except:
+ return True
+
+
+def matches_date_pattern(matched_str):
+ # Screen out date false positives
+ for year_regex in year_patterns:
+ if year_regex.match(matched_str):
+ return True
+ return False
+
+def is_website(matched_str):
+ # TODO
+ return False
+
+def detect_pii(text, lang, tag_types):
+ matches = []
+ for tag in tag_types:
+ label_pattern = mst_regexes[tag]
+ # !! regex.match happens here!!
+ matches_tmp = label_pattern.finditer(text)
+ for match in matches_tmp:
+ # TODO: Why does this happen?
+ if match.groups():
+ if len(match.groups()) > 1 and match.groups()[1]:
+ sys.stderr.write("Warning: Found substring matches in the main match.")
+ #print(tag)
+ #print(text)
+ #print(match.groups())
+ matched_str = match.groups()
+ #print(matched_str)
+ # Why does this happen?
+ matched_str = matched_str[0]
+ start, end = match.span(1)
+
+ if matched_str:
+ if tag in ["IP_ADDRESS"]:
+ # Filter out false positive IPs
+ if not ip_has_digit(matched_str):
+ continue
+ # this is to filter out versions, copied from bigcode-pii
+ if filter_versions(matched_str, text[start-100:end+100]):
+ #print('Detected: version: ', matched_str)
+ continue
+ # this is to filer out invalid ip address, copied from bigcode-pii
+ if not_ip_address(matched_str):
+ #print('Detected: invalid id address: ', matched_str)
+ continue
+ if tag in ["ID", "IP_ADDRESS"]: #, "NUMBER"]:
+ # Filter out date false positives
+ if matches_date_pattern(matched_str):
+ continue
+ # TODO: Implement
+ # if tag in ["KEY"]:
+ # # TODO: implement
+ # if is_website(matched_str):
+ # continue
+ matches += [(matched_str, match.span(), str(label_pattern), tag, lang)]
+ return matches
+
+
+#@title Redaction function defined here.
+def redact_pii(text, matches):
+ """Takes a match as defined in the detect_pii function and redacts it from the full string, returning a tuple."""
+ redacted_str = text
+ metadata = []
+ for match in matches:
+ matched_str = match[0]
+ tag = match[3]
+ redact_tag = "PI:" + tag
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+ # Create the "metadata" as all of the information we had before redaction
+ metadata += [(match)]
+ return (redacted_str, metadata)
+
+#@title General function to run the PII detection and redact it, saving everything else to metadata, is defined here.
+def run_pii(text, lang):
+ """
+ Runs the given set of regexes on the data "lines" and pulls out the
+ tagged items.
+ The lines structure stores the language type(s). This can be used for
+ language-specific regexes, although we're dropping that for now and using
+ only "default"/non-language-specific regexes.
+ """
+
+ #print('Detecting....')
+ # What is this for...?
+ text = text.encode().decode()
+ matches = detect_pii(text, lang, high_risk_tags)
+ #print(matches)
+ match_set = (text, {})
+ if len(matches) > 0:
+ # !!! REDACTION HAPPENS HERE !!!
+ redacted_str, metadata = redact_pii(text, matches)
+ metadata_out = {"regex metadata":metadata, "original": text, "redacted": redacted_str}
+ match_set = (redacted_str, metadata_out)
+ return match_set
+
+
+def run_pii_batch(exs, lang):
+ """
+ Runs the given set of regexes on the data "lines" and pulls out the
+ tagged items.
+ The lines structure stores the language type(s). This can be used for
+ language-specific regexes, although we're dropping that for now and using
+ only "default"/non-language-specific regexes.
+ """
+ regex_metadata = []
+ old_text = []
+ new_text = []
+ modified = []
+ for text in exs["text"]:
+ # What is this for...?
+ text = text.encode().decode()
+ matches = detect_pii(text, lang, high_risk_tags)
+ if len(matches) > 0:
+ # !!! REDACTION HAPPENS HERE !!!
+ redacted_str, metadata = redact_pii(text, matches)
+ regex_metadata.append(repr(metadata))
+ old_text.append(text)
+ new_text.append(redacted_str)
+ modified.append(True)
+ else:
+ regex_metadata.append("")
+ old_text.append(text)
+ new_text.append(text)
+ modified.append(False)
+ result = {
+ "regex_metadata": regex_metadata,
+ "old_text": old_text,
+ "text": new_text,
+ "modified": modified
+ }
+ return result
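+
+# Example (illustrative): run_pii returns the (possibly redacted) text and a
+# metadata dict; with the tag-style redaction used in this module, an email
+# becomes PI:EMAIL:
+#   redacted, meta = run_pii("Contact me at jane@example.com", "en")
+#   # redacted == 'Contact me at PI:EMAIL'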
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
new file mode 100644
index 000000000..b9df49d64
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/detect_pii.py
@@ -0,0 +1,80 @@
+
+#from presidio_analyzer import AnalyzerEngine
+from utils import parse_recognizer_result, high_risk_tags
+from bigscience_pii_detect_redact import matches_date_pattern, detect_pii
+
+
+
+def detect_phone_numbers(text, analyzer):
+ # use presidio phone recognizer to detect phone numbers
+ # threshold is set to 0.4 based on a sample study
+ results = analyzer.analyze(text=text,
+ entities=['PHONE_NUMBER'],
+ #language='en',
+ #score_threshold=0.4,
+ #return_decision_process=True
+ )
+
+ pii_list = []
+
+ if len(results)>0:
+ for result in results:
+ # parse the output into dictionary
+ pii_dict = parse_recognizer_result(result)
+
+ # check if the number string is a date
+ number_str = text[pii_dict['start']: pii_dict['end']]
+
+ if matches_date_pattern(number_str):
+ #print('Date, not phone number')
+ pass
+
+ else:
+ pii_dict['value']=number_str
+ pii_list.append(pii_dict)
+ #print(pii_dict)
+
+ return pii_list
+
+
+
+def detect_other_piis(text):
+ matches = detect_pii(text, None, high_risk_tags)
+ if len(matches)>0:
+ pii_list = []
+ for m in matches:
+ pii = {}
+ pii['type']=m[-2]
+ pii['start']=m[1][0]
+ pii['end']=m[1][1]
+ pii['value']=m[0]
+ #print(pii)
+ pii_list.append(pii)
+
+ return pii_list
+ else:
+ return None
+
+def merge_outputs(presidio_outputs, bigscience_outputs):
+ if bigscience_outputs!=None:
+ piis = presidio_outputs + bigscience_outputs
+ # TODO: sometimes KEY and PHONE_NUMBER overlap
+ # when merging, only keep one of them
+ # right now, the short-cut is to have the KEY and PHONE_NUMBER replacement to be the same format
+
+ # detected_spans = []
+ # piis_to_remove = []
+ # for pii in piis:
+ # span = (pii['start'], pii['end'])
+ # if span in detected_spans:
+ # #remove pii from piis
+ # print('remove this pii: ', pii)
+ # piis_to_remove.append(pii)
+
+ # detected_spans.append(span)
+
+ # piis = [pii for pii in piis if pii not in piis_to_remove]
+
+ else:
+ piis = presidio_outputs
+ return piis
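+
+# Example (illustrative): detect_other_piis returns a list of span dicts, e.g.
+#   detect_other_piis("ping me at jane@example.com")
+#   -> [{'type': 'EMAIL', 'start': 11, 'end': 27, 'value': 'jane@example.com'}]
+# merge_outputs then simply concatenates these with the presidio phone spans.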
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
new file mode 100644
index 000000000..d5054a37e
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction.py
@@ -0,0 +1,320 @@
+"""
+this script is for processing pii detection and redaction
+"""
+
+# from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+from utils import summarize_pii_entities
+import time
+import json
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+
+import os, sys
+import time
+import argparse
+from pprint import pprint
+from typing import Dict, List
+
+import ray
+import ray.data
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+
+import logging
+import glob
+
+
+def detect_redact_pii_for_one_text(text, analyzer):
+
+ detected_phone_numbers = detect_phone_numbers(text, analyzer)
+
+ # get output from bigscience-pii
+ detected_other_piis = detect_other_piis(text)
+ # merge the two outputs
+ piis = merge_outputs(detected_phone_numbers, detected_other_piis)
+ #print('Merged PIIs: ', piis)
+
+ if len(piis)>0:
+ # save result
+ #redact
+ redacted_text = redact_pii_with_random_values(text, piis)
+ #redacted_text = redact_pii_with_tags(text, piis)
+
+ output = {
+ 'redacted': redacted_text,
+ 'pii': piis,
+ "modified": True
+ }
+
+
+ else:
+ output = {
+ 'redacted': None,
+ 'pii': None,
+ "modified": False
+
+ }
+
+ return output
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ group = parser.add_argument_group(title="input data")
+ group.add_argument(
+ "--input",
+ type=str,
+ default="tiiuae/falcon-refinedweb",
+ required=False,
+ help="Name of the dataset repository,e.g. togethercomputer/RedPajama-Data-1T"
+ )
+
+ # group.add_argument(
+ # "--format",
+ # type=str,
+ # default="parquet",
+ # required=False,
+ # help="input data format, parquet or json"
+ # )
+
+ group.add_argument(
+ "--dataset-family",
+ type=str,
+ default="refinedweb",
+ required=False,
+ help="choose from: refinedweb, slimpajama, pile"
+ )
+
+ group.add_argument(
+ "--data-dir",
+ type=str,
+ required=False,
+ help="for local mode, you need to provide local dataset repository, e.g. /home/user/local"
+ )
+ group.add_argument(
+ "--cache-dir",
+ type=str,
+ default='/root/.cache',
+ help="Hugging Face cache dir, where the hugging face dataset it stored"
+ )
+ group.add_argument(
+ '--local',
+ default=False,
+ action='store_true',
+ help="whether to use local mode to preprocess data"
+ )
+ group.add_argument(
+ "--load-batch-size", type=int, default=1000, help="only needed if you use streaming mode to read data from hugging face"
+ )
+ group.add_argument(
+ "--skip", type=int, default=None, help="how many samples to skip"
+ )
+ group = parser.add_argument_group(title="output data")
+ group.add_argument(
+ "--output-prefix",
+ type=str,
+ required=False,
+ default="processed",
+ help="Path to binary output file without suffix",
+ )
+ group = parser.add_argument_group(title="runtime")
+ group.add_argument(
+ "--cpu-per-worker", type=int, default=1, help="Number of CPUs to use per worker"
+ )
+
+ args = parser.parse_args()
+ args.output_path = '/home/user/local'
+ return args
+
+
+def main():
+ args = get_args()
+
+ # if args.format not in ['parquet', 'json']:
+ # raise ValueError('data file format must be parquet or json')
+
+ output_dir = os.path.join(args.output_path, args.output_prefix)
+ if not os.path.exists(output_dir):
+ os.mkdir(output_dir)
+ exception_dir = output_dir+'/exceptions/'
+ cache_dir = args.cache_dir
+ dataset_family = args.dataset_family
+ log_dir = output_dir+'/logs/'
+ if not os.path.exists(log_dir):
+ os.mkdir(log_dir)
+
+ logging.basicConfig(filename=log_dir+"newlog.txt",
+ format='%(asctime)s %(message)s',
+ filemode='w')
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ logger.info(args)
+ logger.info('processing {} data.....'.format(dataset_family))
+
+ # init ray
+ ray.init(address='auto')
+ pprint(ray.cluster_resources())
+ num_nodes = len(ray.nodes())
+ parallelism = num_nodes * args.cpu_per_worker
+
+ logger.info('num of ray nodes: {}'.format(num_nodes))
+ logger.info('parallelism: {}'.format(parallelism))
+
+ def preprocess_fn(contents, metas, analyzer) -> pd.DataFrame:
+ # inputs are in batches
+ redacted_content = []
+ modified = []
+ piis = []
+ meta_output = []
+ original_content = []
+
+ exceptions = []
+
+ for i, text in enumerate(contents):
+ try:
+ # # for testing exception
+ # if i%5 == 0:
+ # raise ValueError
+ output = detect_redact_pii_for_one_text(text, analyzer)
+ modified.append(output['modified'])
+ piis.append(output['pii'])
+ if output['pii'] != None:
+ redacted_content.append(output['redacted'])
+ else:
+ redacted_content.append(text)
+ meta_output.append(metas[i])
+ except:
+ logger.debug('exception occurred!') # seems cannot log from ray actor using this method
+ exceptions.append({
+ 'text':text,
+ 'meta': metas[i]
+ })
+ if len(exceptions)>0:
+ if not os.path.exists(exception_dir):
+ os.mkdir(exception_dir)
+ task_id = ray.get_runtime_context().get_task_id()
+ with open(exception_dir + task_id+'.json', 'w') as f:
+ json.dump(exceptions, f)
+
+ return pd.DataFrame({#"original": original_content,
+ 'redacted': redacted_content,
+ 'piis': piis,
+ 'meta': meta_output,
+ 'modified': modified})
+
+
+
+ def pii_removal_refinedweb(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+
+ contents = batch['content'].tolist()
+
+ try:
+ urls = batch['url'].tolist()
+ timestamps = batch['timestamp'].tolist()
+ dump = batch['dump'].tolist()
+ segment = batch['segment'].tolist()
+ metas = []
+
+ for i in range(len(urls)):
+ metas.append({
+ 'url': urls[i],
+ 'timestamp': timestamps[i],
+ 'dump': dump[i],
+ 'segment': segment[i]
+ })
+ except:
+ metas = [None]*len(contents)
+
+ return preprocess_fn(contents, metas, analyzer)
+
+ def pii_removal_slimpajama_pile(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+ # try:
+ contents = batch['text'].tolist()
+ try:
+ metas = batch['meta'].tolist()
+ # print(metas)
+ except:
+ metas = [None]*len(contents)
+ return preprocess_fn(contents, metas, analyzer)
+ # except:
+ # if not os.path.exists(exception_dir):
+ # os.mkdir(exception_dir)
+ # task_id = ray.get_runtime_context().get_task_id()
+ # with open(exception_dir + task_id+'.json', 'w') as f:
+ # json.dump(batch, f)
+
+
+
+ if not args.local:
+ dataset = load_dataset(args.input, streaming=True)['train']
+ else:
+ data_dir = args.data_dir
+ if dataset_family == 'refinedweb':
+ datafiles = glob.glob(data_dir + '*.parquet')
+ dataset = load_dataset('parquet', data_files = datafiles, streaming=True)['train']
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ datafiles = glob.glob(data_dir+'*.jsonl')
+ dataset = load_dataset('json', data_files = datafiles, streaming=True)['train']
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ if args.skip != None:
+ dataset_to_process = dataset.skip(args.skip)
+ else:
+ dataset_to_process = dataset
+
+ idx = 1
+
+ t0 = time.time()
+ for rows in dataset_to_process.iter(batch_size=args.load_batch_size):
+ logger.info('Start processing batch # {}'.format(idx))
+ print("-----------------------------")
+ df = pd.DataFrame(rows)
+ # logger.info(df['meta'])
+ ray_dataset = ray.data.from_pandas(df)
+ ray_dataset = ray_dataset.repartition(parallelism) #, shuffle = True)
+
+ if dataset_family == 'refinedweb':
+ print('process refinedweb')
+ process_fn = pii_removal_refinedweb
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ print('process slimpj or pile')
+ process_fn = pii_removal_slimpajama_pile
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ tokenized_data = ray_dataset.map_batches(process_fn, batch_format="numpy", batch_size=None)
+
+ if dataset_family == 'refinedweb':
+ tokenized_data.write_parquet(output_dir)
+ elif dataset_family == 'slimpajama' or dataset_family == 'pile':
+ tokenized_data.write_json(output_dir)
+ else:
+ raise ValueError('{} not supported'.format(dataset_family))
+
+ logger.info('Finished processing batch # {}'.format(idx))
+ logger.info(f"{idx} * {args.load_batch_size} samples were written to disk.")
+ idx += 1
+ print("============================")
+ if idx == 2:
+ #sys.exit()
+ break
+ t1 = time.time()
+ logger.info('Processing {} samples took {:.3f} sec'.format((idx-1)*args.load_batch_size, t1-t0))
+
+
+if __name__ == "__main__":
+ start = time.time()
+ main()
+ end = time.time()
+ print(f"\nthis script took {end-start}s.")
+
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
new file mode 100644
index 000000000..ff78bb9c9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/pii_redaction_v2.py
@@ -0,0 +1,294 @@
+"""
+this script is for processing pii detection and redaction
+"""
+
+# from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+from utils import summarize_pii_entities
+import time
+import json
+from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+
+import os, sys
+import time
+import argparse
+from pprint import pprint
+from typing import Dict, List
+
+try:
+ import ray
+ import ray.data
+except:
+ pass
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+
+import logging
+import glob
+
+
+def detect_redact_pii_for_one_text(text, analyzer):
+
+ detected_phone_numbers = detect_phone_numbers(text, analyzer)
+
+ # get output from bigscience-pii
+ detected_other_piis = detect_other_piis(text)
+ # merge the two outputs
+ piis = merge_outputs(detected_phone_numbers, detected_other_piis)
+ #print('Merged PIIs: ', piis)
+
+ if len(piis)>0:
+ # save result
+ #redact
+ redacted_text = redact_pii_with_random_values(text, piis)
+ #redacted_text = redact_pii_with_tags(text, piis)
+
+ output = {
+ 'redacted': redacted_text,
+ 'pii': piis,
+ "modified": True
+ }
+
+
+ else:
+ output = {
+ 'redacted': None,
+ 'pii': None,
+ "modified": False
+
+ }
+
+ return output
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ group = parser.add_argument_group(title="input data")
+ group.add_argument(
+ "--input",
+ type=str,
+ default="tiiuae/falcon-refinedweb",
+ required=False,
+ help="Name of the dataset repository,e.g. togethercomputer/RedPajama-Data-1T"
+ )
+
+ # group.add_argument(
+ # "--format",
+ # type=str,
+ # default="parquet",
+ # required=False,
+ # help="input data format, parquet or json"
+ # )
+
+ group.add_argument(
+ "--dataset-family",
+ type=str,
+ default="refinedweb",
+ required=False,
+ help="choose from: refinedweb, slimpajama, pile"
+ )
+
+ group.add_argument(
+ "--data-dir",
+ type=str,
+ required=False,
+ help="for local mode, you need to provide local dataset repository, e.g. /home/user/local"
+ )
+ group.add_argument(
+ "--cache-dir",
+ type=str,
+ default='/root/.cache',
+ help="Hugging Face cache dir, where the hugging face dataset it stored"
+ )
+ group.add_argument(
+ '--local',
+ default=False,
+ action='store_true',
+ help="whether to use local mode to preprocess data"
+ )
+ group.add_argument(
+ "--load-batch-size", type=int, default=1000, help="only needed if you use streaming mode to read data from hugging face"
+ )
+ group.add_argument(
+ "--skip", type=int, default=None, help="how many samples to skip"
+ )
+ group = parser.add_argument_group(title="output data")
+ group.add_argument(
+ "--output-prefix",
+ type=str,
+ required=False,
+ default="processed",
+ help="Path to binary output file without suffix",
+ )
+ group = parser.add_argument_group(title="runtime")
+ group.add_argument(
+ "--cpu-per-worker", type=int, default=1, help="Number of CPUs to use per worker"
+ )
+
+ args = parser.parse_args()
+ args.output_path = '/home/user/local'
+ return args
+
+
+def main():
+ args = get_args()
+
+ # if args.format not in ['parquet', 'json']:
+ # raise ValueError('data file format must be parquet or json')
+
+ output_dir = os.path.join(args.output_path, args.output_prefix)
+ if not os.path.exists(output_dir):
+ os.mkdir(output_dir)
+ exception_dir = output_dir+'/exceptions/'
+ cache_dir = args.cache_dir
+ dataset_family = args.dataset_family
+ log_dir = output_dir+'/logs/'
+ if not os.path.exists(log_dir):
+ os.mkdir(log_dir)
+
+ logging.basicConfig(filename=log_dir+"newlog.txt",
+ format='%(asctime)s %(message)s',
+ filemode='w')
+
+ logger = logging.getLogger()
+ logger.setLevel(logging.DEBUG)
+
+ logger.info(args)
+ logger.info('processing {} data.....'.format(dataset_family))
+
+ # init ray
+ ray.init(address='auto')
+ pprint(ray.cluster_resources())
+ num_nodes = len(ray.nodes())
+ parallelism = num_nodes * args.cpu_per_worker
+
+ logger.info('num of ray nodes: {}'.format(num_nodes))
+ logger.info('parallelism: {}'.format(parallelism))
+
+ def preprocess_fn(contents, analyzer) -> pd.DataFrame:
+ # inputs are in batches
+ text, doc_id, hash, meta, source, bytes = contents
+ redacted_content = []
+ modified = []
+ piis = []
+ meta_output = []
+ doc_id_output = []
+ hash_output = []
+ source_output = []
+ bytes_output = []
+
+ exceptions = []
+
+ for i, txt in enumerate(text):
+ try:
+ # # for testing exception
+ # if i%5 == 0:
+ # raise ValueError
+ output = detect_redact_pii_for_one_text(txt, analyzer)
+ modified.append(output['modified'])
+ piis.append(output['pii'])
+ if output['pii'] != None: # have PII so output redacted text
+ redacted_content.append(output['redacted'])
+ else: # did not have PII so output original text
+ redacted_content.append(txt)
+ meta_output.append(meta[i])
+ doc_id_output.append(doc_id[i])
+ hash_output.append(hash[i])
+ source_output.append(source[i])
+ bytes_output.append(bytes[i])
+ except:
+ logger.debug('exception occurred!') # seems cannot log from ray actor using this method
+ exceptions.append({
+ 'text':txt,
+ 'doc_id': doc_id[i]
+ })
+ if len(exceptions)>0:
+ if not os.path.exists(exception_dir):
+ os.mkdir(exception_dir)
+ task_id = ray.get_runtime_context().get_task_id()
+ with open(exception_dir + task_id+'.json', 'w') as f:
+ json.dump(exceptions, f)
+
+ return pd.DataFrame({#"original": original_content,
+ 'new_content': redacted_content,
+ 'meta': meta_output,
+ 'doc_id':doc_id_output,
+ 'hash': hash_output,
+ 'source': source_output,
+ 'bytesize':bytes_output,
+ 'secrets': piis,
+ 'modified': modified})
+
+
+ def pii_removal(batch: Dict[str, List]) -> pd.DataFrame:
+ # analyzer = AnalyzerEngine()
+ analyzer = PhoneRecognizer()
+ text = batch['text'].tolist()
+ doc_id = batch['doc_id'] #.to_list()
+ hash = batch['hash']#.to_list()
+ source = batch['source'].tolist()
+ bytes = batch['bytesize'].tolist()
+ meta = batch['meta'].tolist()
+
+ # try:
+ # meta = batch['meta'].tolist()
+ # # print(metas)
+ # except:
+ # meta = [None]*len(contents)
+
+ contents = (text, doc_id, hash, meta, source, bytes)
+
+ return preprocess_fn(contents, analyzer)
+
+
+ if not args.local:
+ dataset = load_dataset(args.input, streaming=True)['train']
+ else:
+ data_dir = args.data_dir
+ if data_dir[-1] != '/':
+ data_dir+='/'
+ datafiles = glob.glob(data_dir + '*.parquet')
+ dataset = load_dataset('parquet', data_files = datafiles, streaming=True)['train']
+
+ if args.skip != None:
+ dataset_to_process = dataset.skip(args.skip)
+ else:
+ dataset_to_process = dataset
+
+ idx = 1
+
+ t0 = time.time()
+ for rows in dataset_to_process.iter(batch_size=args.load_batch_size):
+ logger.info('Start processing batch # {}'.format(idx))
+ print("-----------------------------")
+ df = pd.DataFrame(rows)
+ # logger.info(df['meta'])
+ ray_dataset = ray.data.from_pandas(df)
+ # partition batch into total number of workers
+ ray_dataset = ray_dataset.repartition(parallelism) #, shuffle = True)
+ # processing batch
+ tokenized_data = ray_dataset.map_batches(pii_removal, batch_format="numpy", batch_size=None)
+ # gather data into one file per node
+ tokenized_data = tokenized_data.repartition(num_nodes)
+ tokenized_data.write_parquet(output_dir)
+
+ logger.info('Finished processing batch # {}'.format(idx))
+ logger.info(f"{idx} * {args.load_batch_size} samples were written to disk.")
+ idx += 1
+ print("============================")
+ # if idx == 2:
+ # #sys.exit()
+ # break
+ t1 = time.time()
+ logger.info('Processing {} samples took {:.3f} sec'.format((idx-1)*args.load_batch_size, t1-t0))
+
+
+if __name__ == "__main__":
+ start = time.time()
+ main()
+ end = time.time()
+ print(f"\nthis script took {end-start}s.")
+
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
new file mode 100644
index 000000000..ac156c750
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/process_exceptions.py
@@ -0,0 +1,10 @@
+import json
+import glob
+
+data_dir = "/home/user/local/processed/refinedweb/exceptions/"
+filename = "2af2834dff874b4affffffffffffffffffffffff13000000.json"
+
+with open(data_dir+filename, 'r') as f:
+ data = json.load(f)
+
+print(len(data))
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
new file mode 100644
index 000000000..0fee73d1a
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/redact_pii.py
@@ -0,0 +1,186 @@
+import ipaddress
+import random
+import string
+
+
+
+# option 1: similar to bigscience-pii redaction:
+# # replace with [TAG], e.g., [EMAIL]
+# #redacted_str, metadata = redact_pii(text, matches)
+# option 2: similar to bigcode-pii redaction:
+# # IP: replace with predefined random IP address, or DNS servers
+# # EMAIL, USERNAME, KEY: replace with random values
+# # also keeping track of pii values through a sample
+# # and replace with the same random value for the same pii value
+# #print(redacted_str)
+# redacted_str = redact_pii_with_random_values(text, matches)
+# # metadata_out = {"regex metadata":metadata, "original": text, "redacted": redacted_str}
+# # match_set = (redacted_str, metadata_out)
+
+
+# The IP replacements are copied from bigcode-pii
+# List of random private IP addresses to use as replacements
+REPLACEMENTS_IP = {
+ "IPv4": ["172.16.31.10", "172.16.58.3", "172.16.17.32", "192.168.127.12", "192.168.3.11"],
+ "IPv6": [
+ "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
+ "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
+ "fc00:e968:6179::de52:7100",
+ "fc00:db20:35b:7399::5",
+ "fdf8:f53e:61e4::18",
+ ],
+}
+
+# providergs = ["google", "cloudfare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"]
+POPULAR_DNS_SERVERS = [
+ "8.8.8.8",
+ "8.8.4.4",
+ "1.1.1.1",
+ "1.0.0.1",
+ "76.76.19.19",
+ "76.223.122.150",
+ "9.9.9.9",
+ "149.112.112.112",
+ "208.67.222.222",
+ "208.67.220.220",
+ "8.26.56.26",
+ "8.20.247.20",
+ "94.140.14.14",
+ "94.140.15.15",
+]
+
+letters = string.ascii_lowercase
+digits = string.digits
+letters_digits = string.ascii_lowercase + string.digits
+
+# random emails
+n = 100
+REPLACEMENT_EMAIL = [
+ "".join(random.choice(letters) for i in range(10)) + "@example.com"
+ for i in range(n)
+ ]
+
+# random keys
+REPLACEMENT_KEY = [
+ "".join(random.choice(digits) for i in range(10))
+ for i in range(n)
+ ]
+# simple hack: make key replacement and phone replacement to be
+# both 10 random digits
+# to simplify redaction
+# [
+# "".join(random.choice(lettters_digits) for i in range(32)) for i in range(n)
+# ]
+
+# random usernames
+REPLACEMENT_USERNAME = [
+ "@"+"".join(random.choice(letters) for i in range(10))
+ for i in range(n)
+ ]
+
+REPLACEMENT_PHONE = [
+ "".join(random.choice(digits) for i in range(10))
+ for i in range(n)
+ ]
+
+REPLACEMENT_DICT={
+ 'EMAIL': REPLACEMENT_EMAIL,
+ 'KEY': REPLACEMENT_KEY,
+ 'USER': REPLACEMENT_USERNAME,
+ 'PHONE_NUMBER':REPLACEMENT_PHONE
+}
+
+def is_private_ip(ip):
+ """Check if an IP address is allocated for private networks"""
+ ip = ipaddress.ip_address(ip)
+ return ip.is_private
+
+def replace_ip(value):
+ """Replace an IP address with a synthetic IP address of the same format"""
+    # ipaddress.ip_address(ip) raises an exception when ip is not valid
+ # if is_private_ip(value) or (value in POPULAR_DNS_SERVERS):
+ # return value
+
+ if value in POPULAR_DNS_SERVERS:
+ #print('IP is one of DNS servers, return original value: ', value)
+ return value
+
+ try:
+ ipaddress.IPv4Address(value)
+ #print('IP is IPv4, return redacted value')
+ return random.choice(REPLACEMENTS_IP["IPv4"])
+ except ValueError:
+ try:
+ ipaddress.IPv6Address(value)
+ #print('IP is IPv6, return redacted value')
+ return random.choice(REPLACEMENTS_IP["IPv6"])
+ except ValueError:
+ # this doesn't happen if we already use ipaddress filter in the detection
+            # this is good as we have another layer of protection against false positives
+ #print("Invalid IP address:", value)
+ return value
+
+def redact_email_key_user_phone(value, tag):
+ supported_tags = {'KEY', 'EMAIL', 'USER', 'PHONE_NUMBER'}
+ if tag in supported_tags:
+ #return random.choice(REPLACEMENT_DICT[tag])
+ if tag=='KEY':
+ redact_value = "".join(random.choice(digits) for i in range(10))
+ if tag == 'EMAIL':
+ redact_value = "".join(random.choice(letters) for i in range(10)) + "@{}.com".format("".join(random.choice(letters) for i in range(5)))
+ if tag == 'USER':
+ redact_value = "@"+"".join(random.choice(letters) for i in range(10))
+ if tag == 'PHONE_NUMBER':
+ redact_value = "".join(random.choice(digits) for i in range(10))
+ return redact_value
+ else:
+ #print('{} type is not supported!'.format(tag))
+ return value
+
+
+# TODO: generate random strings on the fly, instead of choose from one of n
+def redact_pii_with_random_values(text, matches):
+ # adapted from bigcode-pii redaction
+ # however, matches here is a list of dictionaries
+ # the dictionary is of this schema:
+ # {'start': 123, 'end': 234, 'value': xyz, 'type': PHONE_NUMBER}
+ redacted_str = text
+ replaced_values = []
+ lookup_dict = {}
+ for match in matches:
+ start_idx = match['start']
+ end_idx = match['end']
+ matched_str = match['value'] #text[start_idx:end_idx]
+ tag = match['type']
+ if matched_str in replaced_values:
+ redact_tag = lookup_dict[matched_str]
+ else:
+ if tag == 'IP_ADDRESS':
+ redact_tag = replace_ip(matched_str)
+
+ else:
+ redact_tag = redact_email_key_user_phone(matched_str, tag)
+
+ replaced_values.append(matched_str)
+ lookup_dict[matched_str]=redact_tag
+
+ # print('original: ', matched_str)
+ # print('redacted tag: ', redact_tag)
+ match['redacted'] = redact_tag
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+ # Create the "metadata" as all of the information we had before redaction
+ #metadata += [(match)]
+ #print(matches)
+ return redacted_str
+
+
+def redact_pii_with_tags(text, matches):
+ # adapted from bigscience-pii
+ redacted_str = text
+ for match in matches:
+ matched_str = match['value']
+ tag = match['type']
+ redact_tag = "[" + tag +"]"
+ redacted_str = redacted_str.replace(matched_str, redact_tag)
+
+ return redacted_str
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
new file mode 100644
index 000000000..5e1d4bdd9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/utils.py
@@ -0,0 +1,162 @@
+import json
+from datasets import load_dataset
+import random
+import phonenumbers  # used by is_phone_number below
+from bigscience_pii_detect_redact import detect_pii
+
+piis_not_to_consider =['PERSON',
+ 'NRP',
+ 'LOCATION',
+ 'DATE_TIME',
+ 'URL'
+ ]
+
+def write_to_html(result, text_corpus):
+ idx = result['doc_idx']
+ text = text_corpus[idx]['text']
+ text_list = list(text)
+
+ entity_categories = []
+ n = 0
+ for r in result['pii']:
+ start = int(r['start'])
+ end = int(r['end'])
+        # wrap the PII span in a highlight tag and append its category
+        text_list.insert(start+3*n, '<mark>')
+        text_list.insert(end+1+3*n, "</mark>")
+        text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+ n+=1
+ entity_categories.append(r['type'])
+
+ bolded = ''.join(text_list)
+ #html = ""+bolded+""
+ #print(html)
+
+ summary = summarize_pii_entities(entity_categories)
+
+    bolded = summary + '<br>' + bolded
+ return bolded, entity_categories
+
+
+def summarize_pii_entities(entity_categories):
+ unique_categories = list(set(entity_categories))
+ summary = 'PIIs: '
+ for e in unique_categories:
+ occurences = entity_categories.count(e)
+ summary += (e + ": "+str(occurences)+'; ')
+ return summary
+
+def parse_recognizer_result(result):
+ #temp = result.split(',')
+ #assert len(temp)==4, 'a valid result should have 4 fields, but only got {} fields'.format(len(temp))
+ parsed_dict = {}
+
+ parsed_dict['type']=result.entity_type
+ parsed_dict['start']=result.start
+ parsed_dict['end']=result.end #temp[2][5:]
+ #parsed_dict['score']=result.score #temp[3][6:]
+
+ return parsed_dict
+
+
+def count_num_piis_to_consider(result):
+ #result is a dictionary of this format
+ # {doc_idx:123, num_pii: 2, pii: [{type: ABC, start:234, end: 256, score:0.6}, {}]}
+ filtered_piis = []
+ piis = result['pii']
+ num_piis_to_consider = 0
+ for pii in piis:
+ if pii['type'] in piis_not_to_consider:
+ #print('Not including {} category'.format(pii['type']))
+ continue
+ else:
+ num_piis_to_consider += 1
+ #print(pii)
+ if pii['type'] != 'IP_ADDRESS' and pii['type'] !='EMAIL_ADDRESS':
+ pii['type'] = 'ID_NUM_STR'
+ #print(pii)
+ filtered_piis.append(pii)
+ #print('number of piis to consider: ', num_piis_to_consider)
+ #print('filtered piis: ',filtered_piis)
+
+ return num_piis_to_consider, filtered_piis
+
+def filter_results_by_category(results):
+ filtered_results = []
+ for result in results:
+ num_piis_to_consider, filtered_piis = count_num_piis_to_consider(result)
+ if num_piis_to_consider>0:
+ result['pii'] = filtered_piis
+ result['num_pii']=len(filtered_piis)
+ filtered_results.append(result)
+ #print('filtered results: ',filtered_results)
+ return filtered_results
+
+def sample_results(results, number_of_samples):
+ random.seed(1234)
+ if len(results) > number_of_samples:
+ return random.sample(results, number_of_samples)
+ else:
+ return results
+
+# this tag list is copied from
+# https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/02_pii/bigscience_pii_detect_redact.py#L53
+high_risk_tags = {'KEY', 'EMAIL', 'USER', 'IP_ADDRESS'} # , 'NUMBER', "ID"}
+
+def detect_with_bigscience_pii_single_sample(text):
+ matches = detect_pii(text, None, high_risk_tags)
+ if len(matches)>0:
+ pii_list = []
+ for m in matches:
+ #print(m)
+ pii = {}
+ pii['type']=m[-2]
+ pii['start']=m[1][0]
+ pii['end']=m[1][1]
+ #print(pii)
+ pii_list.append(pii)
+
+ return pii_list
+ else:
+ return None
+
+def is_phone_number(matched_str):
+ DEFAULT_SUPPORTED_REGIONS = ("US", "UK", "DE", "FE", "IL", "IN", "CA", "BR")
+ #valid = phonenumbers.is_valid_number(matched_str)
+ #print(matched_str)
+ for region in DEFAULT_SUPPORTED_REGIONS:
+ try:
+            # pass the region so numbers without a country code can be parsed
+            parsed_number = phonenumbers.parse(matched_str, region)
+        except:
+            #print('cannot parse the string as phone number')
+            continue
+
+ flag = phonenumbers.is_possible_number(parsed_number)
+ if flag == True:
+ #print('KEY is PHONE_NUMBER')
+ return True
+
+ return False
+
+
+
+def remove_phone_numbers_from_bigscience_results(matches):
+ # use bigscience-pii to detect
+ # emails, ip addresses, usernames, id alphanumerics
+ if len(matches)>0:
+ pii_list = []
+ phone_matches = []
+ for i, m in enumerate(matches):
+ matched_str = m[0]
+ if is_phone_number(matched_str):
+ phone_matches.append(i)
+ # else:
+ # # print(m)
+ # pii = {}
+ # pii['type']=m[-2]
+ # pii['start']=m[1][0]
+ # pii['end']=m[1][1]
+ # print(pii)
+ # pii_list.append(pii)
+
+
+ matches = [matches[i] for i in range(len(matches)) if i not in phone_matches]
+ return matches
diff --git a/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
new file mode 100644
index 000000000..c7b32203f
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_detection_redaction/src/validate_ray_outputs.py
@@ -0,0 +1,158 @@
+from datasets import load_dataset
+# from presidio_analyzer import AnalyzerEngine
+from utils import summarize_pii_entities
+# import time
+import json
+# from redact_pii import redact_pii_with_random_values, redact_pii_with_tags
+# from detect_pii import detect_other_piis, detect_phone_numbers, merge_outputs
+import glob
+import random
+import os
+import pandas as pd
+# from matplotlib import pyplot as plt
+
+def write_merged_detect_redact_results_to_html(sample):
+ # sample is a dictionary {'text': xxxx, pii: []}
+ #text = sample['original']
+ piis = sample['secrets']
+ entity_categories = []
+
+ if sample['modified']:
+ #text_list = list(text)
+ redacted_text = sample['text']
+ redacted_text_list = list(redacted_text)
+
+
+ n = 0
+ for r in piis:
+ start = int(r['start'])
+ end = int(r['end'])
+            # text_list.insert(start+3*n, '<mark>')
+            # text_list.insert(end+1+3*n, "</mark>")
+            # text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+
+            # wrap the PII span in a highlight tag and append its category
+            redacted_text_list.insert(start+3*n, '<mark>')
+            redacted_text_list.insert(end+1+3*n, "</mark>")
+            redacted_text_list.insert(end+2+3*n, '[[['+r['type']+']]]')
+
+ n+=1
+ entity_categories.append(r['type'])
+
+
+ bolded = ''.join(redacted_text_list)
+ #html = ""+bolded+""
+ #print(html)
+
+ # redacted_marked = ''.join(redacted_text_list)
+ summary = summarize_pii_entities(entity_categories)
+        bolded = summary + '<br>' + bolded
+
+ else:
+ bolded = sample['text']
+ # redacted_marked = None
+
+ return bolded, entity_categories
+
+
+
+
+path = '/home/vmagent/app/falcon-refinedweb-pii-remove/'
+datafile = glob.glob(path + '*.parquet')
+# randomly pick one file from output
+filename = random.choice(datafile)
+output = 'pii_test'
+
+# Check 1: load with pd to check schema and content
+df = pd.read_parquet(filename)
+print(df.head(10))
+
+print(df.shape)
+
+
+# Check 2: get statistics from a sample
+def get_stats(row):
+    count_dict = {
+        'PHONE_NUMBER': 0,
+        'IP_ADDRESS': 0,
+        'EMAIL': 0,
+        'USER': 0,
+        'KEY': 0
+    }
+
+    if row['modified']:
+        pii = row['secrets']
+        num_piis = len(pii)
+        for x in pii:
+            count_dict[x['type']] += 1
+    else:
+        num_piis = 0
+
+    return num_piis, count_dict
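+# Example (illustrative): a modified row with two emails and one key yields
+#   (3, {'PHONE_NUMBER': 0, 'IP_ADDRESS': 0, 'EMAIL': 2, 'USER': 0, 'KEY': 1})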
+
+
+sample_files = random.sample(datafile, min(10, len(datafile)))
+total_num_piis = 0
+count_dict_all = {
+    'PHONE_NUMBER': 0,
+    'IP_ADDRESS': 0,
+    'EMAIL': 0,
+    'USER': 0,
+    'KEY': 0
+}
+
+for f in sample_files:
+    df = pd.read_parquet(f)
+    # sample at most 1000 rows so small files do not raise in .sample()
+    df = df.sample(min(1000, len(df)))
+    for _, row in df.iterrows():
+        num_piis, count_dict = get_stats(row)
+        total_num_piis += num_piis
+        for k, v in count_dict.items():
+            count_dict_all[k] += v
+
+print(count_dict_all)
+
+
+# Check 3: visual check with html
+df = df.sample(min(100, len(df)))
+html = ""
+num_piis = []
+entities = []
+summary = ""
+
+for _, sample in df.iterrows():
+    bolded, entity_categories = write_merged_detect_redact_results_to_html(sample)
+
+    try:
+        meta = sample['meta']
+        html += '<p>' + meta + '<br>' + bolded + '</p>'
+    except (KeyError, TypeError):
+        html += '---------------------------'
+        html += '<p>' + bolded + '</p>'
+
+    if sample['modified']:
+        num_piis.append(len(sample['secrets']))
+        entities.extend(entity_categories)
+
+assert sum(num_piis) == len(entities), 'number of entities does not match'
+
+summary += 'Total number of PIIs: {}'.format(len(entities))
+summary += '<br>' + summarize_pii_entities(entities) + '<br>'
+
+html = '<html><body>' + summary + html + '</body></html>'
+
+output_path = path + 'validation/'
+os.makedirs(output_path, exist_ok=True)
+
+output_file = output_path + '{}-pii-validation.html'.format(output)
+with open(output_file, "w") as f:
+    f.write(html)
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_redaction.py b/tools/pii_removal_for_contact_mp/pii_redaction.py
new file mode 100644
index 000000000..bcac8a0f9
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_redaction.py
@@ -0,0 +1,23 @@
+import argparse
+from pyrecdp.core.utils import Timer
+from pii_redaction_impl import pii_remove_MP
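+# Entry point for multi-process PII removal over a folder of parquet files.
+# Example invocation (paths are illustrative):
+#   python pii_redaction.py -d <input_dir> -o <output_dir> -mp <num_processes>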
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", dest="data_dir", type=str, help="input folder of parquet files")
+    parser.add_argument("-o", dest="out_dir", type=str, help="output folder")
+    parser.add_argument("-mp", dest="mp", type=int, default=-1, help="number of parallel processes, -1 to auto-detect")
+    args = parser.parse_args()
+
+    data_dir = args.data_dir
+    out_dir = args.out_dir
+    n_parallel = args.mp
+
+    with Timer(f"pii removal for {data_dir}"):
+        pii_remove_MP(data_dir, out_dir, n_parallel)
\ No newline at end of file
diff --git a/tools/pii_removal_for_contact_mp/pii_redaction_impl.py b/tools/pii_removal_for_contact_mp/pii_redaction_impl.py
new file mode 100644
index 000000000..9589e7833
--- /dev/null
+++ b/tools/pii_removal_for_contact_mp/pii_redaction_impl.py
@@ -0,0 +1,142 @@
+import argparse
+import os, sys
+from pyrecdp.core.utils import Timer
+from pyrecdp.primitives.llmutils.utils import get_nchunks_and_nproc
+import ast
+import pandas as pd
+from tqdm import tqdm
+import subprocess #nosec
+import time
+
+from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+
+import pathlib
+cur_path = str(pathlib.Path(__file__).parent.resolve())
+import_path = os.path.join(cur_path, "pii_detection_redaction", "src")
+print(f"add new import_path: {import_path}")
+sys.path.append(import_path)
+
+from pii_redaction_v2 import *
+
+def pii_removal_impl_parquet_to_parquet(in_file_name, out_file_name, base_file_name):
+    # PhoneRecognizer handles phone numbers; the remaining PII types are
+    # handled inside detect_redact_pii_for_one_text from pii_redaction_v2
+    analyzer = PhoneRecognizer()
+    batch = pd.read_parquet(in_file_name).reset_index(drop=True)
+    text = batch['text'].tolist()
+    redacted_content = []
+    modified = []
+    piis = []
+
+    for txt in text:
+        output = detect_redact_pii_for_one_text(txt, analyzer)
+        modified.append(output['modified'])
+        piis.append(output['pii'])
+        if output['pii'] is not None:  # has PII, output the redacted text
+            redacted_content.append(output['redacted'])
+        else:  # no PII, keep the original text
+            redacted_content.append(txt)
+
+    batch['text'] = pd.Series(redacted_content)
+    batch['secrets'] = pd.Series(piis)
+    batch['modified'] = pd.Series(modified)
+
+    batch.to_parquet(out_file_name)
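+# The output parquet keeps the input schema and adds two columns:
+#   'secrets'  - list of detected PIIs (None when nothing was found)
+#   'modified' - True when 'text' was rewritten with redacted values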
+
+# define actual work
+def pii_remove(proc_id, x_list, out_type):
+    for x in x_list:
+        try:
+            in_file_name, out_file_name, base_file_name = x
+            base_file_name = os.path.basename(base_file_name)
+            out_dir = os.path.dirname(out_file_name)
+            os.makedirs(out_dir, exist_ok=True)
+            pii_removal_impl_parquet_to_parquet(in_file_name, out_file_name, base_file_name)
+        except Exception as e:
+            # leave a per-file error log next to the expected output
+            with open(f"{out_file_name}.error.log", 'w') as f:
+                f.write(f"Failed to process {base_file_name}, error is {e}")
+    return True
+
+def wait_and_check(pool):
+    for proc_id, (process, cmd) in pool.items():
+        std_out, std_err = process.communicate()
+        rc = process.returncode
+        if rc != 0:
+            file_name = f"pii-redaction-proc-{proc_id}.error.log"
+            print(f"Task failed, please check {file_name} for detailed information")
+            with open(file_name, "a") as f:
+                f.write(f"=== {time.ctime()} {' '.join(cmd)} failed. ===\n")
+                f.write(std_err.decode(sys.getfilesystemencoding()))
+                f.write("\n")
+
+def launch_cmdline_mp(args, data_dir, out_dir, mp):
+    pool = {}
+    for arg in tqdm(args, total=len(args), desc="pii redaction"):
+        proc_id, x_list = arg
+        cmd = ["python", "pii_redaction_impl.py", "--proc_id", f"{proc_id}", "--in_dir", f"{data_dir}", "--out_dir", f"{out_dir}", "--file_list", f"{x_list}"]
+        pool[proc_id] = (subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE), cmd)
+
+        # wait for the whole batch to finish before launching the next one
+        if len(pool) >= mp:
+            wait_and_check(pool)
+            pool = {}
+
+    wait_and_check(pool)
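+# Parallelism here is process-per-file via subprocess rather than a
+# multiprocessing.Pool: each worker re-invokes this script with --proc_id and
+# --file_list, so a crash in one file cannot take down the whole run.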
+
+def get_target_file_list(data_dir, file_type):
+    cmd = ["find", data_dir, "-name", f"*.{file_type}"]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode != 0:
+        return []
+    # return paths relative to data_dir
+    ret = stdout.decode("utf-8").split('\n')[:-1]
+    ret = [i.replace(data_dir, "") for i in ret]
+    ret = [i[1:] if i[0] == '/' else i for i in ret]
+    return ret
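+# Example (illustrative): with data_dir='/data' holding /data/a/1.parquet and
+# /data/2.parquet, get_target_file_list('/data', 'parquet') returns
+# ['a/1.parquet', '2.parquet'].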
+
+def pii_remove_MP(data_dir, out_dir, n_part = -1):
+    files = get_target_file_list(data_dir, 'parquet')
+
+    if len(files) == 0:
+        print("Detected 0 files, exiting")
+        return
+
+    if n_part != -1:
+        n_proc = n_part
+    else:
+        _, n_proc = get_nchunks_and_nproc(len(files), n_part = n_part)
+        print(f"resetting number of processes to {n_proc}")
+
+    # one task per file; launch_cmdline_mp caps concurrency at n_proc
+    args = [(idx, [i]) for idx, i in enumerate(files)]
+    launch_cmdline_mp(args, data_dir, out_dir, n_proc)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--proc_id", dest="proc_id", type=int)
+    parser.add_argument("--in_dir", dest="in_dir", type=str)
+    parser.add_argument("--out_dir", dest="out_dir", type=str)
+    parser.add_argument("--file_list", dest="file_list", type=str)
+    args = parser.parse_args()
+
+    proc_id = args.proc_id
+    in_dir = args.in_dir
+    out_dir = args.out_dir
+    # the parent passes the file list as a Python list literal; parse it safely
+    in_file_list = ast.literal_eval(args.file_list)
+    out_type = 'parquet'
+
+    file_args = [(os.path.join(in_dir, f_name), os.path.join(out_dir, f"{f_name}.pii_remove.{out_type}"), f_name) for f_name in in_file_list]
+
+    with Timer(f"pii removal with proc-id {proc_id}"):
+        pii_remove(proc_id, file_args, out_type)