This repository has been archived by the owner on Nov 12, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
keywords.php
96 lines (90 loc) · 2.49 KB
/
keywords.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<?php
include 'vendor/autoload.php';
use Box\Spout\Reader\Common\Creator\ReaderEntityFactory;
// Phase 1: read sites.xlsx and build $urls, a lookup keyed by the first column
// of each row whose value is the whole row joined with '|'.
echo PHP_EOL . 'Load file sites.xlsx' . PHP_EOL;
$urls = [];
$reader = ReaderEntityFactory::createReaderFromFile('sites.xlsx');
$reader->open('sites.xlsx');
$i = 0; // number of rows read (including rows that are skipped below)
foreach ($reader->getSheetIterator() as $sheet) {
    foreach ($sheet->getRowIterator() as $row) {
        $i++;
        // Print a progress marker once every 10000 rows. The previous check
        // (`$i % 10000` without a comparison) was truthy for every row that
        // was NOT a multiple of 10000, flooding the console.
        if ($i % 10000 === 0) {
            echo $i . ' ';
        }

        // Normalise every cell. Values are written back by index instead of
        // iterating by reference, so no dangling reference outlives the loop.
        $cells = $row->toArray();
        foreach ($cells as $index => $cell) {
            $cell = trim($cell);
            $cell = str_replace("\n", ' ', $cell);
            // NOTE(review): search and replacement look identical here (a no-op);
            // possibly a double space or non-breaking space originally - confirm.
            $cell = str_replace(' ', ' ', $cell);
            $cells[$index] = $cell;
        }

        // Rows without a usable key in the first column are ignored.
        // empty() already covers the "not set" case.
        if (empty($cells[0])) {
            continue;
        }

        // Guarantee that columns 1..3 exist so every stored line has at least
        // four '|'-separated fields.
        foreach ([1, 2, 3] as $column) {
            if (!isset($cells[$column])) {
                $cells[$column] = '';
            }
        }

        // First occurrence of a key wins; later duplicates are dropped.
        $siteKey = $cells[0];
        if (!isset($urls[$siteKey])) {
            $urls[$siteKey] = implode('|', $cells);
        }
    }
}
$reader->close();
echo PHP_EOL . 'LOADED ' . $i . ' urls' . PHP_EOL . PHP_EOL;
// Phase 2: read main.xlsx and group site lines by keyword.
// Column 0 is used as the keyword and column 1 as the key into $urls;
// rows whose column 1 has no entry in $urls are skipped.
echo 'Load file main.xlsx' . PHP_EOL;
$reader = ReaderEntityFactory::createReaderFromFile('main.xlsx');
$reader->open('main.xlsx');
$keywords = [];
$i = 0; // number of rows read (including rows that are skipped below)
foreach ($reader->getSheetIterator() as $sheet) {
    foreach ($sheet->getRowIterator() as $row) {
        $i++;
        // Print a progress marker once every 10000 rows. The previous check
        // (`$i % 10000` without a comparison) fired on every row that was
        // NOT a multiple of 10000.
        if ($i % 10000 === 0) {
            echo $i . ' ';
        }

        // Trim all cells without a by-reference loop, so no reference to the
        // last cell is left dangling after the iteration.
        $cells = array_map('trim', $row->toArray());

        // Both the keyword and the site key are required; empty() also
        // covers the case where the column is missing entirely.
        if (empty($cells[0]) || empty($cells[1])) {
            continue;
        }

        $keyword = $cells[0];
        $siteKey = $cells[1];
        if (!isset($urls[$siteKey])) {
            continue;
        }

        // Appending creates the per-keyword list on first use.
        $keywords[$keyword][] = $urls[$siteKey];
    }
}
$reader->close();
echo PHP_EOL . 'LOADED ' . $i . ' keywords' . PHP_EOL . PHP_EOL;
// Phase 3: collapse each keyword's list of site lines into a single output
// line of the form "keyword|line1|line2|...". $keywords is rewritten in place.
echo 'Prepare data' . PHP_EOL;
$i = 0; // number of keywords processed
foreach ($keywords as $key => $siteLines) {
    // foreach iterates by value, so assigning to $keywords[$key] here does
    // not disturb the iteration.
    $keywords[$key] = $key . '|' . implode('|', $siteLines);
    $i++;
    // Print a progress marker once every 10000 keywords. The previous check
    // (`$i % 10000` without a comparison) fired on every keyword that was
    // NOT a multiple of 10000.
    if ($i % 10000 === 0) {
        echo $i . ' ';
    }
}
echo PHP_EOL . 'PREPARED ' . $i . ' keywords' . PHP_EOL . PHP_EOL;
// Phase 4: write every prepared keyword line to output.txt, one per line.
echo 'Save to file output.txt' . PHP_EOL;
$i = 0; // number of lines written
$fh = fopen('output.txt', 'w');
// fopen() returns false on failure; stop here with a clear error instead of
// passing false to fwrite() for every line.
if ($fh === false) {
    throw new RuntimeException('Unable to open output.txt for writing');
}
foreach ($keywords as $value) {
    fwrite($fh, $value . PHP_EOL);
    $i++;
    // Print a progress marker once every 10000 lines. The previous check
    // (`$i % 10000` without a comparison) fired on every line that was
    // NOT a multiple of 10000.
    if ($i % 10000 === 0) {
        echo $i . ' ';
    }
}
fclose($fh);
echo PHP_EOL . 'Saved ' . $i . ' lines' . PHP_EOL . PHP_EOL;