Skip to content

Commit

Permalink
validate/fix sentence IDs (#16)
Browse files Browse the repository at this point in the history
  • Loading branch information
nschneid committed Nov 21, 2021
1 parent d9b1cc3 commit fcb2ef6
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 11 deletions.
24 changes: 22 additions & 2 deletions modified_streusle_scripts/conllulex2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ def _postproc_sent(sent):

sent = {}
sent_conllulex = ''
first_sent_in_doc = False

for ln in chain(inF, [""]): # Add empty line at the end to avoid skipping the last sent
ln = ln.strip()
Expand All @@ -248,12 +249,18 @@ def _postproc_sent(sent):

if ln.startswith('#'): # metadata
if store_conllulex=='full': sent_conllulex += ln + '\n'
if ln.startswith('# newdoc ') or ln.startswith('# newpar ') or ln.startswith('# TODO: '): continue
if ln.startswith('# newdoc '):
first_sent_in_doc = True
continue
elif ln.startswith('# newpar ') or ln.startswith('# TODO: '): continue
m = re.match(r'^# (\w+) = (.*)$', ln)
assert m,ln
k, v = m.group(1), m.group(2)
assert k not in ('toks', 'swes', 'smwes', 'wmwes')
assert k not in sent,(k,sent[k])
sent[k] = v
if k=='sent_id':
assert first_sent_in_doc==(int(v.rsplit('-',1)[1])==1),v # First sentence in document should end in -1, -01, -001 etc.
else: # regular and ellipsis tokens
if 'toks' not in sent:
sent['toks'] = [] # excludes ellipsis tokens, so they don't interfere with indexing
Expand Down Expand Up @@ -368,6 +375,8 @@ def _postproc_sent(sent):
sent['etoks'].append(tok)
else:
sent['toks'].append(tok)

first_sent_in_doc = False
if sent:
if store_conllulex: sent['conllulex'] = sent_conllulex
_postproc_sent(sent)
Expand Down Expand Up @@ -407,11 +416,22 @@ def print_sent_json(sent):
def print_json(sents):
    """Print the sentences as a JSON array on stdout, validating sentence IDs.

    Each sentence is emitted by calling ``print_sent_json``; sentences are
    separated by a line containing ``,`` and the whole output is wrapped in
    ``[`` / ``]`` lines.

    While iterating, two consistency checks are applied to ``sent['sent_id']``:

    * no sentence ID may occur twice;
    * splitting the ID at its last ``-`` into (document, number), the number
      must be exactly one more than the previous sentence's number when the
      document part is unchanged, and must be 1 when the document part changes.

    The checks raise ``AssertionError`` explicitly (rather than using the
    ``assert`` statement) so that they still run under ``python -O``.

    :param sents: iterable of sentence dicts, each with a ``'sent_id'`` key
    :raises AssertionError: on a duplicate or out-of-sequence sentence ID
    """
    print('[')
    first = True
    seen_ids = set()
    sent_id = None
    for sent in sents:
        # check for duplicate sentence IDs
        prev_sent_id = sent_id
        sent_id = sent['sent_id']
        if sent_id in seen_ids:
            raise AssertionError(('Duplicate sent_id:', sent_id))
        seen_ids.add(sent_id)

        # specially format the output
        if first:
            first = False
        else:
            # check that sentence IDs count from 1 within docs
            pdoc, pnum = prev_sent_id.rsplit('-', 1)
            doc, num = sent_id.rsplit('-', 1)
            if pdoc == doc:
                in_sequence = int(num) == int(pnum) + 1
            else:
                in_sequence = int(num) == 1
            if not in_sequence:
                raise AssertionError(('Invalid sent_id:', sent_id, 'after', prev_sent_id))
            print(',')
        print_sent_json(sent)
    print(']')
Expand Down
6 changes: 3 additions & 3 deletions pastrie.conllulex
Original file line number Diff line number Diff line change
Expand Up @@ -7183,7 +7183,7 @@
22 itself itself PRON PRP Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs 21 nmod:npmod _ _ _ PRON itself _ _ _ _ _ O-PRON
23 ? ? PUNCT . _ 2 punct _ _ _ PUNCT ? _ _ _ _ _ O-PUNCT

# sent_id = german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03
# sent_id = german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-02
# text = you just answered the question yourself.
# mwe = you just answered the question yourself .
1 you you PRON PRP Case=Nom|Person=2|PronType=Prs 3 nsubj _ _ _ PRON you _ _ _ _ _ O-PRON
Expand All @@ -7194,7 +7194,7 @@
6 yourself yourself PRON PRP Case=Acc|Number=Sing|Person=2|PronType=Prs|Reflex=Yes 3 obl:npmod _ _ _ PRON yourself _ _ _ _ _ O-PRON
7 . . PUNCT . _ 3 punct _ _ _ PUNCT . _ _ _ _ _ O-PUNCT

# sent_id = german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-04
# sent_id = german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03
# text = Where else should they be if not on the PCB?
# mwe = Where else should they be if not on the PCB ?
1 Where where ADV WRB PronType=Int 10 advmod _ _ _ ADV where _ _ _ _ _ O-ADV
Expand Down Expand Up @@ -9614,7 +9614,7 @@
13 trial trial NOUN NN Number=Sing 9 parataxis _ _ _ N trial _ _ _ _ _ O-N
14 ) ) PUNCT -RRB- _ 13 punct _ _ _ PUNCT ) _ _ _ _ _ O-PUNCT

# sent_id = german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-07
# sent_id = german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-08
# text = Looks pretty Nazi-like to me
# mwe = Looks pretty Nazi - like to me
1 Looks look VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ _ V look _ _ _ _ _ O-V
Expand Down
6 changes: 3 additions & 3 deletions pastrie.govobj.json
Original file line number Diff line number Diff line change
Expand Up @@ -150369,7 +150369,7 @@
"wmwes": {}
},
{
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03",
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-02",
"text": "you just answered the question yourself.",
"mwe": "you just answered the question yourself .",
"toks": [
Expand Down Expand Up @@ -150549,7 +150549,7 @@
"wmwes": {}
},
{
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-04",
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03",
"text": "Where else should they be if not on the PCB?",
"mwe": "Where else should they be if not on the PCB ?",
"toks": [
Expand Down Expand Up @@ -201101,7 +201101,7 @@
"wmwes": {}
},
{
"sent_id": "german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-07",
"sent_id": "german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-08",
"text": "Looks pretty Nazi-like to me",
"mwe": "Looks pretty Nazi - like to me",
"toks": [
Expand Down
6 changes: 3 additions & 3 deletions pastrie.json
Original file line number Diff line number Diff line change
Expand Up @@ -15371,7 +15371,7 @@
"wmwes": {}
},
{
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03",
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-02",
"text": "you just answered the question yourself.",
"mwe": "you just answered the question yourself .",
"toks": [
Expand All @@ -15397,7 +15397,7 @@
"wmwes": {}
},
{
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-04",
"sent_id": "german-cfa8fd9b-a8c2-9379-c816-a94b0e42b253-03",
"text": "Where else should they be if not on the PCB?",
"mwe": "Where else should they be if not on the PCB ?",
"toks": [
Expand Down Expand Up @@ -20542,7 +20542,7 @@
"wmwes": {}
},
{
"sent_id": "german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-07",
"sent_id": "german-b7b4ea19-f3c6-0ca9-f523-ae25f9908c04-08",
"text": "Looks pretty Nazi-like to me",
"mwe": "Looks pretty Nazi - like to me",
"toks": [
Expand Down

0 comments on commit fcb2ef6

Please sign in to comment.