-
Notifications
You must be signed in to change notification settings - Fork 0
/
bc-geonames2sqlite.pl
executable file
·293 lines (226 loc) · 8.12 KB
/
bc-geonames2sqlite.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/bin/perl
# NOTE: if you just want the resulting database:
# http://geonames2.db.94y.info
# This script converts the geonames files at:
# http://download.geonames.org/export/dump/
# into an SQLite3 db
# NOTE: this program does NOT include allCountries.zip (it'd be
# out of date anyway); please obtain it yourself:
# http://download.geonames.org/export/dump/allCountries.zip
# These files are assumed to be in the current directory:
# allCountries.zip
# admin1CodesASCII.txt
# countryInfo.txt
use utf8;
use Text::Unidecode;
use Math::Round;
push(@INC,"/usr/local/lib");
require "bclib.pl";
# this program takes time to run, so warn about missing files ASAP
for $i ("admin1CodesASCII.txt", "countryInfo.txt", "allCountries.zip") {
unless (-f $i) {die "$i must exist in current directory";}
}
# these files are pretty important so using /var/tmp not /tmp
open(B,">/var/tmp/altnames.out");
open(C,">/var/tmp/geonames.out");
open(D,">/var/tmp/tzones.out");
# TODO: I never create a table from the file below, but should
open(E,">/var/tmp/featurecodes.out");
# things that are listed as ADM/PCL, but aren't really
# probably bad to hardcode ids here: one of them doesn't even exist anymore!
# if you're REALLY curious what these are:
# http://ws.geonames.org/get?geonameId=2634343 (for example)
# removing as test 11 Feb 2012
# @fakeadm = (2411430,3370684,6940286,921810,6693220,2634343);
# %fakeadm = list2hash(@fakeadm);
# create cheat table for parents
unless (-f "/var/tmp/admpcl.txt") {
system("zgrep -e 'ADM|PCL' allCountries.zip 1> /var/tmp/admpcl.txt");
}
open(A,"/var/tmp/admpcl.txt");
while (<A>) {
chomp($_);
($geonameid, $name, $asciiname, $alternatenames, $latitude, $longitude,
$featureclass, $featurecode, $countrycode, $cc2, $admin1code,
$admin2code, $admin3code, $admin4code, $population, $elevation,
$gtopo30, $timezone, $modificationdate) = split("\t",$_);
# ignore fake ADM/PCL
if ($fakeadm{$geonameid}) {next;}
# ignore admin1code of 00 meaning unknown (unless this is a PCL)
if ($admin1code eq "00" && $featurecode=~/^ADM/) {next;}
# for ADM1-4 and PCL, record full path
if ($featurecode eq "ADM4" && $admin4code ne $geonameid && $admin4code ne "") {
$ADM4{$countrycode}{$admin1code}{$admin2code}{$admin3code}{$admin4code} = $geonameid;
# debug("$countrycode/$admin1code/$admin2code/$admin3code/$admin4code: $geonameid");
} elsif ($featurecode eq "ADM3" && $admin3code ne $geonameid && $admin3code ne "") {
# $ADM3{$countrycode}{$admin1code}{$admin2code}{$admin3code} = $geonameid;
debug("$countrycode/$admin1code/$admin2code/$admin3code: $geonameid");
} elsif ($featurecode eq "ADM2" && $admin2code ne $geonameid && $admin2code ne "") {
$ADM2{$countrycode}{$admin1code}{$admin2code} = $geonameid;
debug("$countrycode/$admin1code/$admin2code: $geonameid");
} elsif ($featurecode eq "ADM1" && $admin1code ne $geonameid && $admin1code ne "") {
$ADM1{$countrycode}{$admin1code} = $geonameid;
debug("$countrycode/$admin1code: $geonameid");
} elsif ($featurecode=~/^PCL/ && $countrycode ne $geonameid && $countrycode ne "") {
$ADM0{$countrycode} = $geonameid;
debug("$countrycode: $geonameid");
} else {
# do nothing
}
}
close(A);
# handle admin1codes
open(A,"admin1CodesASCII.txt");
while (<A>) {
chomp($_);
($code,$short,$long,$id) = split("\t",$_);
($cc,$ad) = split(/\./, $code);
for $i ($ad,$short,$long) {
$i = cleanup($i);
print B "$id\t$i\n";
}
}
close(A);
# and country codes
open(A,"countryInfo.txt");
while (<A>) {
if (/^\#/ || /^iso/) {next;}
($ISO, $ISO3, $ISONumeric, $fips, $Country, $Capital, $Area, $Population,
$Continent, $tld, $CurrencyCode, $CurrencyName, $Phone,
$PostalCodeFormat, $PostalCodeRegex, $Languages, $geonameid, $neighbours,
$EquivalentFipsCode) = split("\t",$_);
for $i ($ISO,$ISO3,$fips,$Country) {
$i = cleanup($i);
print B "$geonameid\t$i\n";
}
}
close(A);
# TODO: remove any blank names that might've snuck in
# and now the main file...
open(A,"zcat allCountries.zip|");
while (<A>) {
chomp($_);
$lines++;
# if ($lines >= 100000) {die "TESTING";}
($geonameid, $name, $asciiname, $alternatenames, $latitude, $longitude,
$featureclass, $featurecode, $countrycode, $cc2, $admin1code,
$admin2code, $admin3code, $admin4code, $population, $elevation,
$gtopo30, $timezone, $modificationdate) = split("\t",$_);
# NOTE: considered limiting to places w/ population of featurecode AP
# meaning "populated place", but decided to just convert everything
# unless ($population || $featureclass=~/^[ap]$/i ) {next;}
# index featurecode
unless ($FEATURECODE{$featurecode}) {
$FEATURECODE{$featurecode} = ++$featurecodecount;
print E "$featurecodecount\t$featurecode\n";
}
$featurecode = $FEATURECODE{$featurecode};
if ($featurecode =~/^pcl/i) {
$adm = 0;
} elsif ($featurecode =~/^adm(\d)$/i) {
$adm = $1;
} else {
$adm = -1;
}
# In theory, the below lets me store both latitude/longitude in a
# 63-bit (or even 48-bit) integer; this may be useful one day if I
# use sqlite3's implicit oid column; for now, it just confuses things
# convert lat/lon to 3-byte int
$latitude = round($latitude*8388607/90);
$longitude = round($longitude*8388607/180);
# index timezone
unless ($TZ{$timezone}) {
$TZ{$timezone} = ++$count;
print D "$count\t$timezone\n";
}
$tz = $TZ{$timezone};
# set admincodes to geonameids (OK if blank)
$admin4new = $ADM4{$countrycode}{$admin1code}{$admin2code}{$admin3code}{$admin4code};
$admin3new = $ADM3{$countrycode}{$admin1code}{$admin2code}{$admin3code};
$admin2new = $ADM2{$countrycode}{$admin1code}{$admin2code};
$admin1new = $ADM1{$countrycode}{$admin1code};
$admin0new = $ADM0{$countrycode};
debug("ADMINS: $admin4new $admin3new $admin2new $admin1new $admin0new");
# record "parent" only; much more efficient
# TODO: better ways to do this... Perl coalesce?
if ($admin4new) {$parent = $admin4new;} elsif
($admin3new) {$parent = $admin3new;} elsif
($admin2new) {$parent = $admin2new;} elsif
($admin1new) {$parent = $admin1new;} elsif
($admin0new) {$parent = $admin0new;} else {
$parent = 0;
}
debug("PARENT: $parent");
# TODO: parent = 0 is probably an error
# the geonames table must come first, because writing to
# alternate_names mangles stuff
print C join("\t", $geonameid, $asciiname, $latitude, $longitude,
$featurecode, $admin0new, $admin4new, $admin3new, $admin2new, $admin1new,
$population, $tz, $elevation)."\n";
# $name and $asciiname and $alternatenames are alt names
for $i ($name,$asciiname,split(",",$alternatenames)) {
$i = cleanup($i);
print B "$geonameid\t$i\n";
}
}
close(A);
close(B);
close(C);
close(D);
close(E);
system("sort -n /var/tmp/altnames.out | uniq > /var/tmp/altnames2.out");
# unidecode the way I want it
sub cleanup {
my($name) = @_;
# unidecode the whole thing first, lower case, despace
$name = lc(unidecode($name));
# remove spaces
$name=~s/\s//isg;
# remove the word "(general)" and "[provisional]"
$name=~s/\(general\)//isg;
$name=~s/\[provisional\]//isg;
# for other (x), change to x (later decided against this)
# $name=~s/\((.*?)\)/$1/isg;
# this is really ugly + might break stuff
$name=~s/[^a-z]//isg;
# if it still has bad chars, report and return empty
if ($name=~/[^a-z]/) {
warn "Ignoring: $name";
return "";
}
return $name;
}
=item sql
To actually create the sqlite3 db, run these commands
CREATE TABLE geonames (
geonameid INTEGER PRIMARY KEY,
asciiname TEXT,
latitude INT,
longitude INT,
feature_code INT,
parent INT,
population INT,
timezone INT,
elevation INT
);
CREATE INDEX i_feature_code ON geonames(feature_code);
CREATE INDEX i_parent ON geonames(parent);
.separator "\t"
.import /var/tmp/geonames.out geonames
CREATE TABLE altnames (
geonameid INT,
name TEXT
);
CREATE INDEX i_name ON altnames(name);
.import /var/tmp/altnames2.out altnames
DELETE FROM altnames WHERE name = '';
INSERT INTO altnames VALUES (0,'');
INSERT INTO geonames (geonameid) VALUES (0);
VACUUM;
CREATE TABLE tzones (
timezoneid INTEGER PRIMARY KEY,
name TEXT
);
.separator "\t"
.import /var/tmp/tzones.out tzones
=cut