-
Notifications
You must be signed in to change notification settings - Fork 1
/
wikiextract.pl
148 lines (119 loc) · 4.98 KB
/
wikiextract.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/perl -w
#
# Extract ukrainian city information from Wikipedia dump
#
# Copyright (c) 2010, Eugene Sandulenko <sev.mail@gmail.com>
#
# This file is provided under GPLv2 license.
#
# Usage: perl -CD wikiextract.pl ukwiki-20101012-pages-articles.xml [cities.csv]
# where first parameter is bunzip2'ed wiki dump taken from
# http://download.wikimedia.org/ukwiki/
# second optional parameter is name for the output file. Default cities.csv
#
# Output:
# cities.csv -- file containing data extract
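# NNNNN.txt -- dump of article contents, NNNNN being the article id
# (currently written only in the two suffixed forms below)
# NNNNNn.txt -- dump of an article not belonging to cities but which
# still uses KOATUU codes (Oblast, Silrada, etc)
# NNNNNq.txt -- like NNNNNn.txt, used when the KOATUU code field of
# such an article is filled in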
#
# Format of cities.csv file:
# Comma separated list having following fields
# num: article id, i.e. file number NNNNN (see above)
# type: kind of settlement (село, селище, смт or місто)
# title: title of Wikipedia document
# name_ua: Ukrainian name of the city
# name_ru: Russian name of the city (actually it is the RU article
# reference, so it requires additional processing)
# koatuu: KOATUU code
# oblast: Oblast of the city
# raion: Raion of the city
# rada: Rada of the city
# elt: elevation of the city
# population: population of the city
# coords: coordinates of the city (in form of coord template)
# zip: zip code of the city
# card: URL of the city card on the Verkhovna Rada site
#
# All fields are represented as they are in the city template
# in the article, that is they require processing and data
# cleansing
use Parse::MediaWikiDump;
use Text::CSV;
use utf8;
# ---------------------------------------------------------------------
# Script setup: command-line arguments, dump reader, CSV writer and the
# header row of the output file.
# ---------------------------------------------------------------------

# Unbuffered STDOUT, so the "\r"-style progress output appears at once.
BEGIN { $| = 1; }

# Arguments: <dump file> [<output csv>]
my $file = shift(@ARGV) or die "must specify a Mediawiki dump file";
my $outfile = shift(@ARGV) || "cities.csv";

my $pages = Parse::MediaWikiDump::Pages->new($file);
my $page;                # article currently being processed by the main loop
my $num = 1;             # bumped by the main loop for every KOATUU-bearing article
my $art = 0;             # number of articles read so far (drives the progress output)
my $total = 858749;      # Hardcoded number of articles in Ukrainian wikipedia
my @cols = ();           # fields of the CSV row under construction (see putCol)

my $csv = Text::CSV->new ( { binary => 1 } ) # should set binary attribute.
    or die "Cannot use CSV: ".Text::CSV->error_diag ();
$csv->eol("\n");

# Lexical handle instead of a package global; it is file-scoped, so the
# rest of the script still sees it as $csvf.
open my $csvf, ">:encoding(utf8)", $outfile or die "$outfile: $!";

# Header row is written by hand so that every column name is quoted.
print $csvf "\"num\",\"type\",\"title\",\"name_ua\",\"name_ru\",\"koatuu\",\"oblast\",\"raion\",\"rada\",\"elt\",\"population\",\"coords\",\"zip\",\"card\"\n";
# Append one field to the CSV row being assembled in @cols.
#   - takes a single value (may be undef)
#   - a defined value has leading and trailing whitespace stripped first
#   - undef is stored as-is
sub putCol {
    my ($field) = @_;

    $field =~ s/^\s+|\s+$//g if defined $field;

    push @cols, $field;
}
# Look up a "| name = value" parameter line in an article's wikitext.
#   $text_ref -- reference to the article text
#   @names    -- compiled patterns (qr//) for the parameter name, tried in order
# Returns the raw text following "=" on the first matching line (it may be
# empty), or "" when none of the names occurs.  No trimming happens here;
# putCol takes care of that.
sub _template_field {
    my ($text_ref, @names) = @_;

    for my $name (@names) {
        # List-context match returns the capture directly, so nothing
        # depends on $1 surviving until later.
        my ($value) = $$text_ref =~ /^\s*\|\s*$name\s*=(.*)$/m;
        return $value if defined $value;
    }

    return "";
}

# ---------------------------------------------------------------------
# Main loop: walk every article of the dump.
#   * Articles mentioning a KOATUU code and carrying one of the settlement
#     templates (and not being a template page themselves) become a CSV row.
#   * Other KOATUU-bearing articles are dumped to <id>n.txt, or <id>q.txt
#     when their KOATUU code field is not blank.
# Articles written to the CSV are not dumped to a text file.
# ---------------------------------------------------------------------
while (defined($page = $pages->next)) {
    my $text = $page->text;    # reference to the wikitext

    if ($$text =~ /од\s+КОАТУУ\s*=/) {
        my $pass = "n";

        # Settlement template name; literal braces are escaped and the
        # character classes no longer contain a stray "[".
        my ($kind) = $$text =~ /\{\{(?:Картка:)?([Сс][еи]ло(?:\s+України)?|[Сс]елище(?:\s+України)?|[Сс]мт(?:\s+України)?|[Мм]істо\s+України)\s*\|/;

        if (defined $kind && $page->title !~ /Шаблон:/) {
            $pass = "";

            my $type;
            if ($kind =~ /^[Сс][еи]ло/) {
                $type = "село";
            } elsif ($kind =~ /^[Сс]елище/) {
                $type = "селище";
            } elsif ($kind =~ /^[Сс]мт/) {
                $type = "смт";
            } elsif ($kind =~ /^[Мм]істо/) {
                $type = "місто";
            } else {
                # Cannot happen while the alternation above stays in sync;
                # die so that a mismatch yields a non-zero exit status.
                die "Wrong type: $kind\n";
            }

            putCol($page->id);
            putCol($type);
            putCol($page->title);

            # The name parameter may appear without a leading "|".  The value
            # is captured from right after "=" so an empty value cannot
            # swallow the following line.
            my ($name_ua) = $$text =~ /^\s*\|?\s*назва\s*=(.*)$/m;
            putCol(defined $name_ua ? $name_ua : "");

            # Interwiki link to the Russian article; non-greedy so a second
            # link on the same line is not captured as well.
            my ($name_ru) = $$text =~ /\[\[ru:(.*?)\]\]/;
            putCol(defined $name_ru ? $name_ru : "");

            putCol(_template_field($text, qr/код\s*КОАТУУ/));
            putCol(_template_field($text, qr/область/, qr/регіон/));
            putCol(_template_field($text, qr/район/));
            putCol(_template_field($text, qr/рада/));
            putCol(_template_field($text, qr/висота/));
            putCol(_template_field($text, qr/населення/));
            putCol(_template_field($text, qr/координати/));
            putCol(_template_field($text, qr/поштовий\s+індекс/, qr/поштові\s+індекси/));
            putCol(_template_field($text, qr/облікова\s+картка/));

            $csv->print($csvf, \@cols)
                or die "$outfile: " . $csv->error_diag();
            @cols = ();
        }

        if ($pass ne "") {
            # "q" marks a non-city article whose KOATUU code is actually
            # filled in (contains at least one non-blank character).
            $pass = "q" if _template_field($text, qr/код\s*КОАТУУ/) =~ /\S/;

            my $dump_name = sprintf("%05d%s.txt", $page->id, $pass);
            open my $out, ">:encoding(utf8)", $dump_name
                or die "$dump_name: $!";
            print {$out} $$text;
            close $out or die "$dump_name: $!";
        }

        $num++;    # counted, though the total is not reported anywhere below
    }

    $art++;
    # Progress: current article id and percentage of the hardcoded total.
    print "\r" . $page->id . (sprintf " %02.2f%%", ($art * 100) / $total) if ($art % 1000 == 0);
}

# Buffered write errors surface at close, so check it.
close $csvf or die "$outfile: $!";
print "\r" . (sprintf " %02.2f%% ($art)", ($art * 100) / $total) . "\n";