-
Notifications
You must be signed in to change notification settings - Fork 2
/
config_wikidata.py
174 lines (137 loc) · 5.51 KB
/
config_wikidata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
This file defines a few constants which configure
which Wikibase instance and which property/item ids
should be used
"""
# MediaWiki API endpoint of the Wikibase instance.
mediawiki_api_endpoint = "https://www.wikidata.org/w/api.php"

# SPARQL query endpoint of the Wikibase instance.
wikibase_sparql_endpoint = "https://query.wikidata.org/sparql"

# Display name of the Wikibase instance and URL of its main page.
wikibase_name = "Wikidata"
wikibase_main_page = "https://www.wikidata.org/wiki/Wikidata:Main_Page"

# ID of the Wikibase namespace which is searched for items.
# Wikidata uses 0, whereas a default Wikibase install uses 120
# (the default Wikibase 'Item:' namespace).
# CHANGE THIS TO 120 if you are adapting this configuration file
# to another Wikibase.
wikibase_namespace_id = 0

# Namespace prefix of Wikibase items, colon included (e.g. 'Item:').
wikibase_namespace_prefix = ""

# User agent used to connect to the Wikidata APIs.
user_agent = "OpenRefine-Wikidata reconciliation interface"
# Regexes used to extract Qids and Pids from bare ids or from URLs,
# together with the index of the capture group which holds the id.
# Each pattern accepts an optional http(s) URL prefix (itself optionally
# preceded by '<') in front of the id, and an optional trailing '>'.
# The dots of the host name are escaped so that they only match a literal
# '.': unescaped, they would match any character and let look-alike hosts
# (e.g. 'wwwXwikidata.org') pass as a valid prefix.
import re
q_re = re.compile(r'(<?https?://www\.wikidata\.org/(entity|wiki)/)?(Q[0-9]+)>?')
q_re_group_id = 3
p_re = re.compile(r'(<?https?://www\.wikidata\.org/(entity/|wiki/Property:))?(P[0-9]+)>?')
p_re_group_id = 3
# Identifier space and schema space exposed to OpenRefine.
# Both should match the IRI prefixes used in RDF serialization.
# Be careful about http versus https here: any variation will break
# comparisons at various places.
identifier_space = "http://www.wikidata.org/entity/"
schema_space = "http://www.wikidata.org/prop/direct/"

# Pattern used to form the URL of a Qid.
# It is only used for viewing, so any protocol is fine
# (therefore, preferably HTTPS if supported).
qid_url_pattern = "https://www.wikidata.org/wiki/{{id}}"

# Class whose items are filtered out by default: any item which is an
# instance of a subclass of this class is dropped.
# For Wikidata, this is "Wikimedia internal stuff", which filters out
# the disambiguation pages, categories, ...
# Set to None to disable this filter.
avoid_items_of_class = "Q17442446"
# Name of the service, exposed at various places, mainly in the list
# of reconciliation services of users.
service_name = "DEV Wikidata"

# URL where this server runs (without the trailing slash).
this_host = "http://localhost:8000"

# Default limit on the number of results we return.
default_num_results = 25

# Maximum number of search results to retrieve from the Wikidata search API
# (need a bot account to get more).
wd_api_max_search_results = 50

# Matching score above which an item should be matched automatically.
validation_threshold = 95

# URI of the Redis instance used for caching at various places.
redis_uri = "redis://localhost:6379/0?encoding=utf-8"

# Prefix put in front of all Redis keys.
redis_key_prefix = "openrefine_wikidata:"

# Headers for the HTTP requests made by the tool.
# The User-Agent value starts with the service name defined above.
headers = {
    "User-Agent": "{} (OpenRefine-Wikibase reconciliation service)".format(service_name),
}
# Previewing settings

# Zoom ratio and dimensions of the preview.
zoom_ratio = 1.0
preview_height = 100
preview_width = 400

# Width which should be requested from Commons for the thumbnail.
thumbnail_width = 130

# All properties to use to get an image, in this order.
# Set to an empty list [] if no image properties are available.
image_properties = [
    "P18",
    "P14",
    "P15",
    "P158",
    "P181",
    "P242",
    "P1766",
    "P1801",
    "P1846",
    "P2713",
    "P2716",
    "P2910",
    "P3311",
    "P3383",
    "P3451",
    "P1621",
    "P154",
]

# URL pattern (%-style placeholders) used to retrieve an image
# from its filename.
image_download_pattern = "https://upload.wikimedia.org/wikipedia/commons/thumb/%s/%s/%s/%dpx-%s"
# Image used when previewing an item which has no image:
# its alt text, and its URL (a static file under the host this server runs on).
fallback_image_alt = "Wikidata"
fallback_image_url = "{}/static/wikidata.png".format(this_host)
# Autodescribe endpoint, used to generate automatic descriptions
# from item contents.
# (Disable this with: autodescribe_endpoint = None)
autodescribe_endpoint = "https://autodesc.toolforge.org/"

# Property proposal settings

# Default type: entity (Q35120).
# Set to None if no such item exists.
default_type_entity = "Q35120"

# Property path used to obtain the type of an item.
type_property_path = "P31"

# Property to follow to fetch the properties for a given type.
# Set to None if this is not available.
property_for_this_type_property = "P1963"

# Optional prefix put in front of properties in SPARQL-like property paths.
wdt_prefix = "wdt:"
# SPARQL query used to fetch all the subclasses of a given item.
# The '$qid' string will be replaced by the qid whose children
# should be fetched.
# (The value keeps its leading and trailing newline.)
sparql_query_to_fetch_subclasses = "\nSELECT ?child WHERE { ?child wdt:P279* wd:$qid }\n"

# SPARQL query used to fetch all the properties which store
# unique identifiers.
# (The value keeps its leading and trailing newline.)
sparql_query_to_fetch_unique_id_properties = "\nSELECT ?pid WHERE { ?pid wdt:P31/wdt:P279* wd:Q19847637 }\n"
# SPARQL query used to propose properties to fetch for items of a given class.
# The text contains the '$base_type', '$lang', '$property_for_this_type' and
# '$limit' placeholders; presumably the code using this template substitutes
# them before sending the query (TODO confirm against the caller).
# As written, the query runs a BFS from wd:$base_type along wdt:P279 links
# (at most 10 iterations and 100 visited nodes), reads the
# wdt:$property_for_this_type values of the visited nodes, and returns them
# ordered by ?depth, capped at $limit results.
# Set to None if property proposal should be disabled.
sparql_query_to_propose_properties = """
SELECT ?prop ?propLabel ?depth WHERE {
SERVICE gas:service {
gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.BFS" .
gas:program gas:in wd:$base_type .
gas:program gas:out ?out .
gas:program gas:out1 ?depth .
gas:program gas:maxIterations 10 .
gas:program gas:maxVisited 100 .
gas:program gas:linkType wdt:P279 .
}
SERVICE wikibase:label { bd:serviceParam wikibase:language "$lang" }
?out wdt:$property_for_this_type ?prop .
}
ORDER BY ?depth
LIMIT $limit
"""