Skip to content

Commit

Permalink
Merge pull request #968 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] improve extract example list and expression section code
  • Loading branch information
xxyzz authored Jan 3, 2025
2 parents 5d7b826 + 4303806 commit f889f65
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 3 deletions.
31 changes: 29 additions & 2 deletions src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand Down Expand Up @@ -35,6 +41,16 @@ def extract_expression_list_item(
node = node.lstrip(": ")
if node != "":
sense_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
link_str = clean_node(wxr, None, node)
if link_str.startswith("Wikisaurus:"):
extract_wikisaurus_page(
wxr, word_entry, link_str, "expressions", "", 0, []
)
elif expression_data.word == "":
expression_data.word = link_str
else:
sense_nodes.append(node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
sense_nodes.append(node)

Expand Down Expand Up @@ -89,6 +105,17 @@ def extract_linkage_section(
source,
tags,
)
elif isinstance(node, WikiNode) and node.kind in LEVEL_KIND_FLAGS:
extract_linkage_section(
wxr,
word_entry,
node,
linkage_type,
sense,
sense_index,
source,
tags,
)


def extract_fraseini_template(
Expand Down Expand Up @@ -220,7 +247,7 @@ def extract_wikisaurus_page(
if pos_title != word_entry.pos_title:
continue
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
linkage_title = clean_node(wxr, None, level3_node.largs)
linkage_title = clean_node(wxr, None, level3_node.largs).lower()
if LINKAGE_SECTIONS.get(linkage_title) != linkage_type:
continue
extract_linkage_section(
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def parse_section(
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs).strip(
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n"
)
if title_text.lower() in POS_DATA:
extract_pos_section(
Expand Down
18 changes: 18 additions & 0 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,21 @@ def extract_example_list_item(
if example.ref == "":
example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
else:
extract_example_text_list(wxr, sense, list_item)


def extract_example_text_list(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Add a plain-text example list item to ``sense.examples``.

    The item's children selected by ``invert_find_child(NodeKind.LIST)``
    (presumably everything except nested lists -- confirm against
    wikitextprocessor) are rendered with ``clean_node``.  When the rendered
    text has the form ``"<example> - <translation>"`` the two halves are
    stored in ``Example.text`` and ``Example.translation``; otherwise the
    whole string is kept as the example text.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        sense: Sense that receives the new example; also passed to
            ``clean_node``.
        list_item: The example list item node.
    """
    list_item_text = clean_node(
        wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
    )
    example = Example(text=list_item_text)
    # Split only on a dash set off by spaces.  A bare "-" is usually part of
    # a hyphenated word (e.g. "guarda-chuva", "disse-me"), and splitting
    # there would cut the example in the middle of a word.
    text, separator, translation = list_item_text.partition(" - ")
    if separator != "":
        example.text = text.strip()
        example.translation = translation.strip()
    sense.examples.append(example)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
"palavras com o kanji": "related",
"compostos": "derived",
"vermos derivados": "derived",
"expressões": "expressions",
}

LINKAGE_TAGS = {
Expand Down
23 changes: 23 additions & 0 deletions tests/test_pt_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,26 @@ def test_text_above_ref_child_list(self):
],
},
)

def test_plain_text_example_list(self):
    """A plain-text example item yields separate text and translation."""
    self.wxr.wtp.add_page("Predefinição:-ja-", 10, "Japonês")
    wikitext = """={{-ja-}}=
==Substantivo 2==
# [[mestre]]; [[perito]]; [[autoridade]]
#* [[雑学]]の'''大家''' - mestre em conhecimentos gerais"""
    page_data = parse_page(self.wxr, "大家", wikitext)
    # The part before the dash is the example, the part after its translation.
    expected_example = {
        "text": "雑学の大家",
        "translation": "mestre em conhecimentos gerais",
    }
    expected_sense = {
        "glosses": ["mestre; perito; autoridade"],
        "examples": [expected_example],
    }
    first_sense = page_data[0]["senses"][0]
    self.assertEqual(first_sense, expected_sense)

0 comments on commit f889f65

Please sign in to comment.