## pseudonymization.cg

## The principle in this CG module is to mark, with a tag carrying the prefix "ps-", all those
## instances where the form in question needs to be removed. The prefix refers to pseudonymization,
## for which the tag can easily be used, and it is used in the IKDP-2 research project.

## This is the initial version of the rule set, to be registered into Zenodo. Further refinements 
## will be done as the testing advances.

# LIST @X = @X ;

## Large, well-known places (countries, regions, big cities) that do not need to be
## removed. The list is matched on base forms; the keep-large-places rule uses it to
## retag Sem/Plc as Sem/LargePlc on these readings.
## Entries are separated by whitespace only; no commas between them.

LIST LARGE-PLACES = "Азербайджан" "Башкирия" "Большеземельская" "Германия" "Голландия"
                    "Канада" "Киров" "Кольский" "Куопио" "Лапландия" "Ленинград"
                    "Малоземельская" "Мезень" "Мексико" "Москва" "Перм" "Печера" "Печора"
                    "Салехард" "Санкт-Петербург" "Сомбатхей" "Сыктывкар" "Тайланд" "Ухта"
                    "Хельсинки" "Япония" ;

## NOTE(review): GENERIC-TERMS and BASIC-NUMBERS are not referenced by any rule in this
## part of the file; they are kept as they are.
LIST GENERIC-TERMS = "Правда" "Владимирскей" "Боголюбскей" ;

## Any reading that carries the place tag Sem/Plc.
LIST SMALL-PLACE = Sem/Plc ;

LIST BASIC-NUMBERS = "мӧд" ;

## Some place names are also ordinary words. Ёль, for example, is a well-known small village
## in the Izhma region of the Komi Republic, but the word also simply means 'stream'. Names of
## this kind are discussed in the article, and the decision for now has been to treat them as
## removable names if they are written with an initial upper-case letter.
## This is not a perfect solution, as it also removes them in sentence-initial position.
## However, the current setup works really well, as the pseudonymization tag is now kept when
## the form is in lower case but the only possible reading is a proper noun.
##
## The rule removes Prop readings when the same cohort (position 0) has a reading that carries
## both N and a tag matching the regular expression below.
## NOTE(review): the regex is presumably meant to match a wordform that begins with a
## lower-case Cyrillic letter. The square brackets are backslash-escaped; confirm that CG-3
## reads them as a character class and not as literal "[" and "]".

REMOVE:PropIfLowerCase (Prop) IF (0 (/<\[а-яӧіё\].+>/r N)) ;

## Place names are handled in two steps, and the order of the two rules matters.
##
## Step 1: on readings whose base form is listed in LARGE-PLACES, replace Sem/Plc with
## Sem/LargePlc. This only sets the large places apart from the rest.

SUBSTITUTE:keep-large-places
    (Sem/Plc) (Sem/LargePlc)
    TARGET LARGE-PLACES ;

## Step 2: on every reading that still carries Sem/Plc (the SMALL-PLACE set), replace it
## with the pseudonymization tag ps-Sem/Plc.

SUBSTITUTE:mark-small-places
    (Sem/Plc) (ps-Sem/Plc)
    TARGET SMALL-PLACE ;

## Date detection. Both rules add ps-Date to readings tagged Num or Ord.

## Readings that are either cardinal (Num) or ordinal (Ord).
LIST NUM-OR-ORD = Num Ord ;

## Base forms used as the left-hand cue by find-dates-with-born.
LIST BIRTH-VERBS = "чужны" "рӧдитчывны" "рӧдитчыны" ;

## A number is marked when the base form "во" occurs anywhere to its right in the window.
ADD:find-dates-with-years (ps-Date) TARGET NUM-OR-ORD IF (1* ("во")) ;

## A number is marked when one of the BIRTH-VERBS occurs anywhere to its left in the window.
ADD:find-dates-with-born (ps-Date) TARGET NUM-OR-ORD IF (-1* BIRTH-VERBS) ;

## Semantic tags that identify a personal name: given names, patronymics and surnames,
## with their male and female variants.
LIST PERSON-NAME-SEM = Sem/Mal Sem/Fem
                       Sem/Patr Sem/Patr-Mal Sem/Patr-Fem
                       Sem/Sur Sem/Sur-Mal Sem/Sur-Fem ;

## Add ps-Name to Prop readings when the cohort has a reading with any of the tags above.
ADD:mark-names (ps-Name) TARGET (Prop) IF (0 PERSON-NAME-SEM) ;

## The date rules above are not sufficient to detect years and dates correctly, so at this
## point every ordinal is marked, which is a more radical approach. Further refinement of
## the system will try to make this more precise.

ADD:find-ordinal-numbers
    (ps-Number)
    TARGET (Ord) ;

## There are instances where words that cannot be names do get tagged as names.
## This is probably due to some foreign names being included in the lexicon,
## which is of course the correct thing to do. But as we know these are never names
## in our corpus, let's remove some of the most common cases.
## This list needs to be extended; cases like "Кар" in particular cause lots of
## unnecessary readings.


# Пока should have lost the tag now
# Бабай
# Комин

# These get now only Name reading:

# Бурак Второй Верес Пастух
# Меня Мало Ниче Один Ахо Бедные Большая Великий Готов Дорогая Золотое Печень

# This is more difficult category:
# Волам > Вов N Prop Sem/Mal Sg Ill PxSg1

# Probably a typo: Сие

## Base forms that are never personal names in this corpus, even though the lexicon
## offers a name reading for them.
LIST IMPOSSIBLE-NAMES =
    "Али" "Бабай" "Баска" "Беда" "Бура" "Бурак" "Буран" "Брак"
    "Ваго" "Ватага" "Воим" "Вон" "Дата" "Дона" "Горай" "Гусь"
    "Иг" "Кёрк" "Кар" "Карл" "Карим" "Класс" "Кино" "Коз"
    "Комин" "Кос" "Кора" "Куим" "Кукла" "Мач" "Машин" "Ной"
    "Ли" "Лок" "Пап" "Пок" "Пока" "Тат" "Стойка" "Школа"
    "Юр" "Ы" ;

## Remove every reading whose base form is in the list, with no contextual condition.
REMOVE TARGET IMPOSSIBLE-NAMES ;

## Cases that still need to be resolved, as it is a bit annoying that forms like "Само"
## currently get only one reading. But this is a more complex problem and also relates to
## the Russian FST.
## "Само" "Севера" "Правда"