-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathdata-downloader.js
65 lines (55 loc) · 1.97 KB
/
data-downloader.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
const fs = require('fs');
const http = require('http');
/**
 * Download the body of `url` over plain HTTP and resolve with it as a string.
 *
 * @param {string|URL} url - Address passed straight to `http.get`.
 * @returns {Promise<string>} Resolves with the complete response body,
 *   decoded as UTF-8, once the response has ended.
 * @throws Rejects with the request/response stream error, or with an `Error`
 *   when the server answers with a status code outside the 2xx range.
 */
const fetch = url =>
  new Promise((resolve, reject) =>
    http
      .get(url, resp => {
        const { statusCode } = resp;
        if (statusCode < 200 || statusCode >= 300) {
          // Discard the body so the socket is released, and fail instead of
          // handing an error page back to the caller as if it were data.
          resp.resume();
          reject(new Error(`Request failed with status ${statusCode}: ${url}`));
          return;
        }
        // Keep the raw buffers and decode once at the end: decoding chunk by
        // chunk corrupts multi-byte characters that straddle a chunk boundary.
        const chunks = [];
        resp.on('data', chunk => {
          chunks.push(chunk);
        });
        resp.on('end', () => {
          resolve(Buffer.concat(chunks).toString('utf8'));
        });
        resp.on('error', reject);
      })
      .on('error', error => reject(error))
  );
// Root URL of the remote directory tree whose listings are crawled below.
// Alternative root, currently disabled:
// const BASE_URL = 'http://creatingdata.us/data/scatter/fiction/tiles/';
// NOTE(review): the value ends in '/' and every use appends another '/', so
// the requested URLs contain '//' — confirm the server tolerates that.
const BASE_URL = 'http://creatingdata.us/data/scatter/hathi/tiles/';
/**
 * Fetch `url` and collect the first capture group of every match of `regex`
 * in the returned page.
 *
 * The caller's regex object is never executed directly: a private global copy
 * is scanned instead, so a stale `lastIndex` cannot skip matches and a regex
 * without the `g` flag cannot loop forever.
 *
 * @param {string} url - Page to download (passed to `fetch`).
 * @param {RegExp} regex - Pattern whose capture group 1 is collected.
 * @returns {Promise<Array<string|undefined>>} Capture group 1 of each match,
 *   in document order (`undefined` where the group did not participate).
 */
const matchRegex = async (url, regex) => {
  const htmlPage = await fetch(url);
  const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
  const scanner = new RegExp(regex.source, flags);
  const matches = [];
  let match;
  while ((match = scanner.exec(htmlPage)) !== null) {
    matches.push(match[1]);
    if (match[0] === '') {
      // A zero-length match does not advance lastIndex; step past it manually
      // so the scan always terminates.
      scanner.lastIndex += 1;
    }
  }
  return matches;
};
/**
 * Build a promise that settles (with no value) after a timer of `ms`
 * milliseconds fires; awaiting it delays the caller by that long.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<undefined>}
 */
const pause = ms =>
  new Promise(wake => {
    setTimeout(wake, ms);
  });
/**
 * Entry point: walk the remote directory listings under BASE_URL
 * (tile folder -> sub folder -> .tsv files) and save each file locally as
 * `data/<tileFolder>-<subFolder>-<file>`.
 *
 * Downloads run strictly one after another with a 500 ms pause before each
 * file request. Any failure is logged and turns into a non-zero exit code
 * instead of an unhandled promise rejection.
 */
(async () => {
  // Index of the first top-level folder to process; earlier ones are skipped.
  // NOTE(review): presumably left over from resuming an interrupted run —
  // confirm before relying on a complete download.
  const firstFolderIndex = 5;

  // writeFileSync does not create missing directories, so make sure the
  // output directory exists before the first file arrives.
  fs.mkdirSync('data', { recursive: true });

  const tileFolders = await matchRegex(`${BASE_URL}/`, /href="([0-9]*)\/"/g);
  console.log(tileFolders);

  for (const tileFolder of tileFolders.slice(firstFolderIndex)) {
    const subFolders = await matchRegex(
      `${BASE_URL}/${tileFolder}/`,
      /href="([0-9]*)\/"/g
    );
    for (const subFolder of subFolders) {
      // The dot is escaped so only names ending in a literal ".tsv" match.
      const files = await matchRegex(
        `${BASE_URL}/${tileFolder}/${subFolder}/`,
        /href="([0-9]*\.tsv)"/g
      );
      for (const file of files) {
        const fileUrl = `${BASE_URL}/${tileFolder}/${subFolder}/${file}`;
        console.log(fileUrl);
        // Space the file requests out rather than sending them back to back.
        await pause(500);
        const fileData = await fetch(fileUrl);
        fs.writeFileSync(`data/${tileFolder}-${subFolder}-${file}`, fileData);
      }
    }
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
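// Shell one-liner to merge the downloaded tiles into a single file: it writes
// the first line of one tile (presumably the column header) to foo.txt, then
// appends every tile's content from its second line on, with sed dropping the
// last line of the combined stream:
// head -1 data/1-0-0.tsv > foo.txt; tail -n +2 -q data/*.tsv | sed -e '$ d' >> foo.txt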