Skip to content

Commit

Permalink
fixed minor issues based on feedback from chatbot users
Browse files Browse the repository at this point in the history
- updated entities based on user feedback
- parser Extract() now accepts numbers like "háromszor" and "kétszer"
- added test case for this
- version 1.2.0 released
  • Loading branch information
sedthh committed Jun 26, 2018
1 parent cdc3637 commit 45920ed
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 8 deletions.
3 changes: 1 addition & 2 deletions lara/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
# Lara - Linguistic Aim Recognizer API

__all__ = 'nlp','parser','stemmer','entities'
__version__ = '1.1.15'
__version__ = '1.2.0'
__version_info__ = tuple(int(num) for num in __version__.split('.'))

import sys
import lara.nlp
import lara.parser
import lara.stemmer
Expand Down
8 changes: 4 additions & 4 deletions lara/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def common():
"sorry" : [{"stem":"(meg)?bocs(i(ka)?|esz|[aá](nat([aá][eé]rt)?|nat[aáo]t?|s+|s+on|j?t(ana)?))?","wordclass":"regex"},{"stem":"elnézés","wordclass":"noun","match_stem":False},{"stem":"sajn[aá]l(om|juk)","wordclass":"regex"},{"stem":"s+z*o*r+[iy]+(ka)?","wordclass":"regex"}],
"lol" : [{"stem":"(h[aei]){2,}h?","wordclass":"regex"},{"stem":"o?(lol)+o?","wordclass":"regex"},{"stem":"[\:\;]\-*[dp\)9]+","wordclass":"regex","boundary":False},{"stem":"[\(8]+\-*[:;]","wordclass":"regex","boundary":False},{"stem":"rot?fl","wordclass":"regex"},{"stem":"vicces","exc":[{"stem":"nem"}]},{"stem":"nevet(tem|ek|[uü]nk)","wordclass":"regex","exc":[{"stem":"nem"}]}],
"nvm" : [{"stem":"felejts[ed]n?\sel","wordclass":"regex"},{"stem":"mindegy","exc":[{"stem":"hogy"},{"stem":"nem"}]},{"stem":"nem fontos"},{"stem":"hagy(jad?|d)","wordclass":"regex","inc":[{"stem":"jól","affix":["van"]},{"stem":"á"},{"stem":"mindegy"},{"stem":"inkább"}]},{"stem":"ne\s(is\s)?(foglalkoz+(on|[aá]l)?|t[oö]r[oöő]dj([oö]n|[eé]l)?)\s(vel(e|[uü]k)|[ae][vz]+[ae]l)","wordclass":"regex"},{"stem":"hagy\w+\sfigyelmen\sk[ií]v[uü]l","wordclass":"regex","exc":[{"stem":"ne"}]},{"stem":"nem baj","max_words":2}],
"help" : [{"stem":"segít","wordclass":"verb","prefix":[],"exc":[{"stem":"nem"}]},{"stem":"segítség","wordclass":"noun","exc":[{"stem":"nem"}]},{"stem":"help","wordclass":"verb","prefix":[],"exc":[{"stem":"nem"}]}],
"help" : [{"stem":"segít","wordclass":"verb","prefix":[],"exc":[{"stem":"nem"},{"stem":"miben"}]},{"stem":"segítség","wordclass":"noun","exc":[{"stem":"nem"}]},{"stem":"help","wordclass":"verb","prefix":[],"exc":[{"stem":"nem"}]}],
"again" : [{"stem":"[uú]j(ra|b[oó]l|at)|(meg)?ism[eé]t(l[eé]s|el(je|ni)?d?)?|m[eé]g\s?eg+y(szer|et)|megint?","wordclass":"regex","exc":[{"stem":"vagyok"},{"stem":"ne","affix":["m"]}]}],
"command" : [{"stem":"(csin[aá]l(jad?|d)|(keres|mutas|mond)[aedjos]+n?|n[eé]z[nz]?[eé]?[dl]|akaro[km]|utas[ií]t\w{1,})","wordclass":"regex"},{"stem":"haj[cts]+(a|[aá]?[ld])\sv[eé]gre","wordclass":"regex"}],
"question" : [{"stem":"(\?+$)|(\?+\s\w+)","wordclass":"regex"},{"stem":"([^,][^,\S+]hogy|^hogy)(an)?","wordclass":"regex"},{"stem":"hol"},{"stem":"honnan"},{"stem":"hová"},{"stem":"hány","affix":["an","at","ból"]},{"stem":"mettől"},{"stem":"meddig"},{"stem":"merre"},{"stem":"mennyi","affix":["en","re"]},{"stem":"mi","affix":["t","k","ket","kor","korra","lyen","lyenek","nek","től","kortól","korra","ből","hez","re","vel"]},{"stem":"ki(k?(e?t|nek|[bt][oöő]l|hez|re|[kv]el)|\saz?)","wordclass":"regex","exc":[{"stem":"\w+[ad]\ski","wordclass":"regex"}]}],
Expand Down Expand Up @@ -121,14 +121,14 @@ def smalltalk():
"how_are_you" : [{"stem":"hogy vagy"},{"stem":"j[oó](l|b+an)\svagy","wordclass":"regex"},{"stem":"(j[oó]l|hogy)\s[eé]rzed\s(most\s)?magad(at)?","wordclass":"regex"},{"stem":"mizu","affix":["js","jság"]},{"stem":"hogy ityeg"},{"stem":"(hogy\stelt\sa|milyen(\svolt\sa)?)\snapod(\svan)?","wordclass":"regex"},{"stem":"[vw]+h*[aá]+[csz]+[aáu]+p+","wordclass":"regex"},{"stem":"(j[oó]|milyen)\s(a\s)?kedved(\svan)?","wordclass":"regex"},{"stem":"mi\sa(z\s[aá]bra|\sst[aá]jsz)","wordclass":"regex"},{"stem":"hogy\s[eé]rz(i|ed)\smag[aá][dt]","wordclass":"regex"},{"stem":"mi a","inc":[{"stem":"helyzet"},{"stem":"stájsz"}]},{"stem":"mit csinálsz","max_words":3},{"stem":"mi a stájsz"},{"stem":"hogy ityeg"}],
"about_name" : [{"stem":"(mond*(ja)?\ski|mi\sa)\s(bece)nev[eé][dt](et)?","wordclass":"regex"},{"stem":"(hogy(an)?|minek)\s(is\s)?(h[ií]v([jn][aá](la)?k|hatom)|nevez+(nek|elek))","wordclass":"regex","exc":[{"stem":"engem"},{"stem":"én"}]},{"stem":"(mi?[eé]rt\s|hogy[\s\-]?hogy\s)(let+\s)?(pont\s)?(ezt?\s(let+\s)?(a\s)?|[ií]gy\s|ilyen\s)(nevez[nt]ek|h[ií]v[nt]ak|neved|nevet\s(kapt[aá][dl]|adt[aá]k))","wordclass":"regex"},{"stem":"mi\sa\s(bece)?neved?","wordclass":"regex","exc":[{"stem":"az|[ae]n+[ae]k|amiben?|amelyik\w*","wordclass":"regex"},{"stem":"engem"},{"stem":"én"}]},{"stem":"n[eé]v(ed)?\seredete","wordclass":"regex"}],
"about_you" : [{"stem":"(mes[eé]lj|besz[eé]lj|mondj)([eo]n)?.+?mag(ad|[aá])r[oó]l","wordclass":"regex"},{"stem":"mutatkoz+([aá]l|on)?\s+be","wordclass":"regex"},{"stem":"(be)?muta(koz(hat)?n[aá]l|(tn[aá]d|sd)\s.+?magad(at)?)","wordclass":"regex"},{"stem":"([km]i(\s|\sa\s.+?)vagy te|te [km]i(\s|\sa\s.+?)vagy)","wordclass":"regex"}],
"about_creator" : [{"stem":"(ki|hogy(an)?)\s(a\s)?(k[eé]sz([ií]t([oöő]d|et+(ek)?)|[uü]lt([eé]l)?)|gazd[aá]d|programoz([oó]d|ot+|tak)|[ií]rt[aá]k?|(hoz(ot+|tak)|j[oö]t+[eé]l).+?(l[eé]tre|vil[aá]gra|k[oó]dod(at)?)|alkot([oó][dt]+|tak)|teremt(et+|[oöő]d)|(keresztelt|nevezet+|adtak)\sel|adot+\s(neked\s)?nevet)","wordclass":"regex"}],
"about_creator" : [{"stem":"(ki|hogy(an)?)\s(a\s)?(k[eé]sz([ií]t([oöő]d|et+(ek)?)|[uü]lt([eé]l)?)|gazd[aá]d|programoz([oó]d|ot+|tak)|[ií]rt[aá]k?|(hoz(ot+|tak)|j[oö]t+[eé]l).+?(l[eé]tre|vil[aá]gra|k[oó]dod(at)?)|alkot([oó][dt]+|tak)|teremt(et+|[oöő]d)|(keresztelt|nevezet+|adtak)\sel|adot+\s(neked\s)?nevet)","wordclass":"regex"},{"stem":"kik?\s(k[eé]sz[ií]tet+|fejlesztet+)(ek)?","wordclass":"regex","max_words":3}],
"about_look" : [{"stem":"hogy(an)?\s(n[eé]zn?[eé]l\ski|mutatsz|festesz)","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+).+?(k[eé]pet|fot[oó]t|sz?elfie?t)\smagadr[oó]l","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+)\smagadr[oó]l.+?(k[eé]pet|fot[oó]t|sz?elfie?t)","wordclass":"regex"},{"stem":"(van|milyen)\s(az?\s)?(arcod|kin[eé]zeted)","wordclass":"regex"},{"stem":"szép vagy"}],
"about_age" : [{"stem":"mennyi idős vagy"},{"stem":"hány éves vagy"},{"stem":"melyik évben születtél"},{"stem":"mikor születtél"},{"stem":"(melyik\s[eé]vben|mikor)\sk[eé]sz([uü]lt[eé]l|[ií]tet+ek)","wordclass":"regex"},{"stem":"(h[aá]ny(adik|ban)|mikor\s(van|[uü]n+epled)\s?a?)\ssz[uü]l(et[eé]s|i)napod(at)?","wordclass":"regex"},{"stem":"h[aá]ny\s[eé]vesnek\s.+?\smagad(at)?","wordclass":"regex"},{"stem":"sz[uü]l(et[eé]s)?i?napod(at)?\s(h[aá]nyadik[aá]n|mikor|melyik)","wordclass":"regex"}],
"about_zodiac" : [{"stem":"(neked\s)?mi\sa\s(horoszk[oó]pod|csil+agjegyed)","wordclass":"regex"},{"stem":"milyen jegyben születtél"},{"stem":"a\s(te\s)?(horoszk[oó]pod|csil+agjegyed)\smi(csoda)?","wordclass":"regex"},{"stem":"milyen\sjegyben\ssz[uü]let+\w+","wordclass":"regex"}],
"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"},{"stem":"hol vagy","max_words":3}],
"about_family" : [{"stem":"ki(k|t|ket)?\s(az?\s|tartasz\sa\s)?(te\s)?(csal[aá]dod(nak)?|sz[uü]l(t|et+[eé]l)|sz[uü]leid(nek)?|([eé]des)?(any(uk)?[aá]d|ap(uk)?[aá]d)(nak)?)","wordclass":"regex"},{"stem":"csal[aá]dban\s([eé]l(sz|tek)|sz[uü]let+[eé]l)","wordclass":"regex"},{"stem":"(h[aá]ny|van(nak)?)\stestv[eé]rei?d","wordclass":"regex"},{"stem":"(kik?|van(n?ak)?[\-\s]?e?)(\sa)?(\shoz+[aá]d?\s?tartoz[oó]i?d|csal[aá]dod)","wordclass":"regex"}],
"about_software": [{"stem":"(hogy(hogy|an)?|mit[oöő]l).+?(m[uüű]k[oö]dsz|(tudsz |vagy k[eé]pes )?(meg)?[eé]rte(sz|d|ni)\,? (meg )?(hogy )?(a?mit mond(ok|tam)|a?mit [ií]r(ok|tam)|engem))","wordclass":"regex"},{"stem":"mi(jen|lyen|en|\s?f[eé]le|\s?fajta)\sfekete\s?m[aá]gia","wordclass":"regex"},{"stem":"neur[aá]lis\sh[aá]l[oó]\w*","wordclass":"regex","inc":[{"stem":"vagy"},{"stem":"te"},{"stem":"működ","wordclass":"verb"}]}],
"about_skills" : [{"stem":"mi(lyen|(ke)?t|k?re)\s(funkci[oó](id?|kat)\s|dolgok(at|ra)\s|tr[uü]k+([oö]k(et|re)|jeid?)\s|parancsok(at|ra)\s)?(tud(sz|n[aá]l)?\s(csin[aá]lni|mutatni)?|ismer(sz)?|(vagy\s|van\s)?(k[eé]pes|(be|meg)?tan[ií]tva)|tan[ií]tot+[aá]k\s(be|neked|meg)?|(k[eé]pes+[eé]gei?d?|tulajdons[aá]g(o|ai)d?)\svan(nak)?)","wordclass":"regex","exc":[{"stem":"mond","wordclass":"verb"}]},{"stem":"mihez ért","affix":["esz"]},{"stem":"mi((ke)?t|k?r[oöő]l)\s(lehet\s|szabad\s|tudok\s)?k[eé]rdez+h?e\w+","wordclass":"regex"},{"stem":"miben tudsz"},{"stem":"k[eé]rdez+(het)?(ek|ni)\st[oöő]led","wordclass":"regex"}],
"about_skills" : [{"stem":"mi(lyen|(ke)?t|k?re)\s(funkci[oó](id?|kat)\s|dolgok(at|ra)\s|tr[uü]k+([oö]k(et|re)|jeid?)\s|parancsok(at|ra)\s)?(tud(sz|n[aá]l)?\s(csin[aá]lni|mutatni)?|ismer(sz)?|(vagy\s|van\s)?(k[eé]pes|(be|meg)?tan[ií]tva)|tan[ií]tot+[aá]k\s(be|neked|meg)?|(k[eé]pes+[eé]gei?d?|tulajdons[aá]g(o|ai)d?)\svan(nak)?)","wordclass":"regex","exc":[{"stem":"mond","wordclass":"verb"}]},{"stem":"mihez ért","affix":["esz"]},{"stem":"mi((ke)?t|k?r[oöő]l)\s(lehet\s|szabad\s|tudok\s)?k[eé]rdez+h?e\w+","wordclass":"regex"},{"stem":"miben tudsz"},{"stem":"k[eé]rdez+(het)?(ek|ni)\st[oöő]led","wordclass":"regex"},{"stem":"mi(ben|vel)\s?tud(sz|n[aá]l)\sseg[ií]teni","wordclass":"regex"}],
"about_topics" : [{"stem":"mir[oöő]l\s.*?besz[eé]lge[st]\w+","wordclass":"regex"},{"stem":"milyen\st[eé]m[aá][bk]*r?[aoó][lnt]","wordclass":"regex"}],
"about_thoughts": [{"stem":"mi(n|re)?\s(gondol(kodsz|ko[dz]ol|sz)|agyalsz|t[oö]prenge?sz|j[aá]r\s(az?\s)?(fejed|agyad)(b[ae]n?)?)","wordclass":"regex"}],
"about_favorite": [{"stem":"melyik","inc":[{"stem":"kedvenc","affix":["ed"]},{"stem":"szeret","affix":["i","ed"],"match_stem":False}]}],
Expand All @@ -139,7 +139,7 @@ def smalltalk():
"are_you_busy" : [{"stem":"elfoglalt","inc":[{"stem":"vagy"}]},{"stem":"r[aá]m?\s?[eé]r(n[eé]l|sz)(\smost)?(\segy)?(\skicsit|\skis\s\w+|\svalamen+yi\w*)?","wordclass":"regex"},{"stem":"(van|volna)\s(most\s)?(r[aá]m?\s)?(most\s)?(egy\s)?(kis\s|kev[eé]s\s|valamen+yi\s)?(szabad\s?)?id[oöő]d(\sr[aá]m)?","wordclass":"regex"},{"stem":"sok dolgod van"}],
"are_you_lying" : [{"stem":"hazud","wordclass":"verb"},{"stem":"nem mondt[aá][dl]\s((el|meg)\saz\s)?igaz(at|s[aá]got)","wordclass":"regex"}],
"are_you_serious": [{"stem":"(nem?|csak)\s(vic+el(sz|j)?|mond+(od|ja)?|ideges[ií]ts(en)?)","wordclass":"regex"},{"stem":"(komolyan|t[eé]nyleg)\s?([uúií]gy\s|azt\s)?((mond|gondol|[ií]r)(ja|od|tad?)|hisz(i|ed)|hit+ed?)","wordclass":"regex"},{"stem":"biztos(an)?\s(vagy\s)?(\w+\s)?(ben+e|eb+en|mond(ta|o)d|mond[jt]a)","wordclass":"regex"},{"stem":"ezt?\s(most\s)?komoly(an)?","wordclass":"regex"},{"stem":"viccelsz","max_words":1}],
"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3},{"stem":"halló","max_words":3},{"stem":"hallasz","max_words":3},{"stem":"mikrofon próba"}],
"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3},{"stem":"halló","max_words":3},{"stem":"hallasz","max_words":3},{"stem":"mikrofon próba"},{"stem":"miért nem válaszolsz"}],
"can_you_learn": [{"stem":"(k[eé]pes(\svagy)?|tud(sz)?)\stanulni","wordclass":"regex"},{"stem":"tanulsz\s(is|[ae].+?b[oóöő]l)","wordclass":"regex"},{"stem":"[dln][aáeéo][km]\s(be|meg)?tan[ií]tani\b","wordclass":"regex","boundary":False},{"stem":"(lehet|tudlak|tudom)\s(t[eé]ged|[oö]nt)?\stan[ií]tani","wordclass":"regex"}],
"can_you_understand_me":[{"stem":"(meg)?[eé]rt(e(d|sz|t+ed?)|i)\,?((\shogy)?\sa?mit\s([ií]r|mond)\w+|\smagyarul)","wordclass":"regex"}],
"contact" : [{"stem":"mi(lyen)?\s(.+?\s)?(e\-?mail\s?)?c[ií]me[dn]?","wordclass":"regex"},{"stem":"elérhetőség","wordclass":"noun"},{"stem":"elér","wordclass":"verb","inc":[{"stem":"önt"},{"stem":"téged"}]}],
Expand Down
4 changes: 2 additions & 2 deletions lara/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1156,7 +1156,7 @@ def timestamps(self,current=False):
# Converts text representation of numbers to digits
def _convert_numbers(self,text):
if text:
matches = _re.findall(r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)\b', re.IGNORECASE, text)
matches = _re.findall(r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|sz[eoö]r|[ckmrtvz]?[ae]l)?)\b', re.IGNORECASE, text)
results = {}
for match in matches:
value = 0
Expand Down Expand Up @@ -1244,7 +1244,7 @@ def _convert_numbers(self,text):

swap = sorted(results.items(), key=lambda x: x[1], reverse=True)
for item in swap:
text = _re.sub(r'\b('+re.escape(item[0])+r')((?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)?\b', re.IGNORECASE, re.escape(str(item[1]))+r'\2', text)
text = _re.sub(r'\b('+re.escape(item[0])+r')((?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|sz[eoö]r|[ckmrtvz]?[ae]l)?)?\b', re.IGNORECASE, re.escape(str(item[1]))+r'\2', text)
return text
return ''

Expand Down
6 changes: 6 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,12 @@ def test_parser_extract(info):
"in" : "egymillió és százezer és tízezer és tízmilliótíz és százezerszáz",
"out" : "1000000 és 100000 és 10000 és 10000010 és 100100"
}
),
(
{
"in" : "mennyi egyszer egy és kétszer kettő?",
"out" : "mennyi 1szer 1 és 2szer 2?"
}
)
])
def test_parser_extract_convert_numbers(info):
Expand Down

0 comments on commit 45920ed

Please sign in to comment.