forked from OpenDataScotland/the_od_bods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdcat.py
71 lines (58 loc) · 2.13 KB
/
dcat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""DCAT catalogue processor: fetches a DCAT JSON feed and flattens it to CSV."""
import copy

from dateutil import parser

# Processor normally lives alongside this script; fall back to the
# package-relative import when this module is imported as part of a package.
try:
    from processor import Processor
except ImportError:  # was a bare except, which also hid unrelated failures
    from .processor import Processor
class ProcessorDCAT(Processor):
    """Processor for DCAT (Data Catalog Vocabulary) JSON catalogue feeds.

    Downloads a DCAT catalogue, flattens each dataset's distributions into
    one CSV row per downloadable file, and writes the result via write_csv().
    """

    def __init__(self):
        # Register this processor under the "dcat" source type.
        super().__init__(type="dcat")

    def get_datasets(self, owner, start_url, fname):
        """Fetch the DCAT catalogue at *start_url* and write rows to *fname*.

        Args:
            owner: unused here; kept for interface parity with sibling
                processors.
            start_url: URL of the DCAT JSON catalogue endpoint.
            fname: output CSV filename, passed through to write_csv().
        """
        # Was: processor.get_json(...) via the module-level singleton —
        # using self makes the method correct on any instance.
        d = self.get_json(start_url)
        if d != "NULL":  # get_json signals a failed fetch with the string "NULL"
            datasets = d["dcat:dataset"]
            print(f"Found {len(datasets)} datasets")
            prepped = []
            for e in datasets:
                # Parse the issue date only when present: parser.parse("")
                # raises, so a dataset missing "dct:issued" would crash.
                issued_raw = e.get("dct:issued", "")
                issued = parser.parse(issued_raw).date() if issued_raw else ""
                ds = [
                    e.get("dct:title", ""),
                    e.get("dct:publisher", "").replace(" Mapping", ""),
                    "",  # link to page
                    "",  # link to data
                    "",  # file name
                    "",  # date created
                    issued,  # date issued/updated
                    "",  # size
                    "",  # size unit
                    "",  # filetype
                    "",  # numrecords
                    ";".join(e.get("dcat:keyword", [])),
                    "",  # manual tags
                    "",  # license
                    e.get("dct:description", "").strip("\u200b"),
                ]
                # Default to [] so a dataset without distributions is skipped
                # instead of crashing the loop.
                pages = e.get("dcat:distribution", [])
                # The distribution described as "Web Page" supplies the
                # landing-page link (row slot 2).
                # NOTE(review): key is "dcat:accessUrl" here; the DCAT spec
                # spells it "dcat:accessURL" — confirm against the live feed.
                for p in pages:
                    if p.get("dct:description", "") == "Web Page":
                        ds[2] = p.get("dcat:accessUrl", "")
                        break
                # Every non-"Web Page" distribution becomes its own CSV row:
                # slot 3 = data link, slot 9 = distribution title (filetype).
                dsl = []
                for p in pages:
                    if p.get("dct:description", "") == "Web Page":
                        continue
                    ds[3] = p.get("dcat:accessUrl", "")
                    ds[9] = p.get("dct:title", "")
                    # deepcopy so later iterations don't mutate earlier rows
                    dsl.append(copy.deepcopy(ds))
                if not dsl:
                    # Dataset with no downloadable distribution: keep one row
                    # so it still appears in the output.
                    dsl.append(ds)
                prepped += dsl
            print(f"{len(prepped)} lines for csv")
            # Was: processor.write_csv(...) — same singleton bug as above.
            self.write_csv(fname, prepped)
def get_license(dataset):
    """Return the structured-license URL from a dataset record.

    Args:
        dataset: mapping expected to contain
            ["attributes"]["structuredLicense"]["url"].

    Returns:
        The license URL string, or "" when any of the nested keys is
        absent or an intermediate value is not subscriptable.
    """
    try:
        return dataset["attributes"]["structuredLicense"]["url"]
    except (KeyError, TypeError):
        # Narrowed from a bare except: only missing keys / None values
        # mean "no license"; anything else should surface.
        return ""
# Module-level singleton. NOTE: get_datasets above refers to this global
# name ("processor") rather than self, so this binding must exist before
# that method is called.
processor = ProcessorDCAT()
if __name__ == "__main__":
    # Processor.process() drives the fetch/write cycle when run as a script.
    processor.process()