forked from lascivaroma/digiliblt
-
Notifications
You must be signed in to change notification settings - Fork 7
/
xpaths.py
48 lines (38 loc) · 1.31 KB
/
xpaths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from glob import glob
from lxml.etree import parse
from MyCapytain.common.constants import XPATH_NAMESPACES
def compute_xpath(element):
    """Return a templated, TEI-prefixed XPath from the tree root to *element*.

    The element and each of its ancestors contribute one ``tei:<name>`` step,
    joined with ``/`` in root-to-element order. A step whose node carries both
    an ``n`` and a ``type`` attribute additionally receives a numbered
    placeholder predicate: ``[@n='$1']`` for the outermost such node,
    ``[@n='$2']`` for the next one down, and so on. Nodes that have only one
    of the two attributes are emitted without a predicate.

    :param element: node exposing ``getparent()``, ``get()`` and ``tag``
        (an lxml element in this script); ``None`` yields an empty string.
    :return: the XPath template as a string.
    """
    tei_namespace = "{http://www.tei-c.org/ns/1.0}"

    # Climb from the element to the root, remembering every node on the way.
    lineage = []
    node = element
    while node is not None:
        lineage.append(node)
        node = node.getparent()

    steps = []
    placeholder = 0
    # Emit the steps root-first, which is the reverse of the climbing order.
    for node in reversed(lineage):
        step = "tei:" + node.tag.replace(tei_namespace, "")
        if node.get("n") is None or node.get("type") is None:
            # Not a citable level: plain step, placeholder counter untouched.
            steps.append(step)
            continue
        placeholder += 1
        steps.append("{}[@n='${}']".format(step, placeholder))
    return "/".join(steps)
# Build a tab-separated table (written to xpaths.csv) that lists, for every
# XML file under sources/*/, the distinct XPath templates computed from it.

# glob() gives no ordering guarantee; sort so the output is reproducible.
files = sorted(glob("sources/*/*.xml"))
# Header row. NOTE: it names three "Xpath" columns, but each data row holds
# as many XPath cells as the file has distinct templates.
data = ["\t".join(["Filename"] + ["Xpath"]*3)]
for file in files:
    try:
        # Binary mode lets the XML parser detect the document's own encoding
        # instead of decoding with the platform's default text encoding.
        with open(file, "rb") as f:
            xml = parse(f)
    except Exception:
        # Report which file broke, then re-raise with the original traceback.
        print(file+" is failing")
        raise

    # Candidate nodes: anything carrying both @type and @n, plus every
    # numbered paragraph.
    divs = xml.xpath("//*[@type and @n]", namespaces=XPATH_NAMESPACES)
    divs += xml.xpath("//tei:p[@n]", namespaces=XPATH_NAMESPACES)
    if not divs:
        # Nothing matched: fall back to a generic paragraph template.
        xpaths = ["tei:TEI/tei:text/tei:body//tei:p[@n='$1']"]
    else:
        # Deduplicate the templates and order them deterministically.
        xpaths = sorted({compute_xpath(div) for div in divs})
    data.append("\t".join([file] + xpaths))

# Explicit UTF-8 so non-ASCII file names are written the same on every platform.
with open("xpaths.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(data))