-
Notifications
You must be signed in to change notification settings - Fork 2
/
index.js
155 lines (125 loc) · 3.57 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const request = require('request')
const fs = require('fs');
const _ = require('lodash')
const path = require('path');
// Absolute path of the input list `gptslist.txt`, resolved next to this script;
// getList reads it and splits it into one entry per line.
const filePath = path.resolve(__dirname, 'gptslist.txt');
// Proxy URL used for both the `request` call and the Chromium `--proxy-server`
// launch flag. The commented line is an alternative local-proxy value; the
// active value is the empty string.
// const PROXY_SERVER = 'http://127.0.0.1:7890';
const PROXY_SERVER = '';
// Shared puppeteer browser handle; assigned by sendRequestsSerially.
let browser = null;
// Accumulates the gizmo objects collected by sendRequest; main serializes it to JSON.
let gptsList = []
/**
 * Builds a timestamp string from the current local time.
 *
 * The result has the shape `YYYY-MM-DD_HH-mm-ss`; every field after the year
 * is left-padded with zeros to two characters. No file extension is included.
 *
 * @returns {string} The formatted timestamp.
 */
function getOutputFileName() {
  const now = new Date();
  const twoDigits = (value) => String(value).padStart(2, '0');

  const datePart = [
    now.getFullYear(),
    twoDigits(now.getMonth() + 1), // getMonth() is zero-based
    twoDigits(now.getDate()),
  ].join('-');

  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()]
    .map((field) => twoDigits(field))
    .join('-');

  return `${datePart}_${timePart}`;
}
/**
 * Fetches `targetUrl` with the `request` library (using PROXY_SERVER as the
 * proxy) and extracts the short URL from the path of the request attached to
 * the response.
 *
 * NOTE(review): presumably `response.request.path` is the path reached after
 * any redirects were followed — confirm against the `request` library.
 *
 * @param {string} targetUrl - URL to fetch.
 * @returns {Promise<string|undefined|null>} The text after the first `/g/` in
 *   the path (`undefined` when the path contains no `/g/`), or `null` when no
 *   response object was delivered. The promise never rejects on a request
 *   error; the error argument is ignored.
 */
async function getRedirectShortUrl(targetUrl) {
  const requestOptions = {
    url: targetUrl,
    proxy: PROXY_SERVER,
  };

  const shortUrl = await new Promise((done) => {
    request.get(requestOptions, (error, res) => {
      // A missing response object is the only failure signal looked at here.
      done(res ? res.request.path.split('/g/')[1] : null);
    });
  });

  return shortUrl;
}
/**
 * Crawls one GPT page: opens a new browser page for
 * `https://chat.openai.com/g/<shortUrl>`, lets only the first request through
 * (every later request is aborted), and reads `props.pageProps.gizmo` out of
 * the `#__NEXT_DATA__` JSON in the response body. A found gizmo is appended to
 * the module-level `gptsList`.
 *
 * A response whose status starts with `3` is resolved through
 * getRedirectShortUrl and crawled again under the returned short URL.
 *
 * The returned promise settles when `page.goto` settles; the response listener
 * runs independently and may still be working at that point.
 *
 * NOTE(review): the page opened here is never closed by this function, so pages
 * accumulate until the caller closes the browser. Closing it here would need to
 * be coordinated with the response listener, which still reads the body.
 *
 * @param {string} shortUrl - Path segment that follows `/g/` in the GPT URL.
 * @returns {Promise<void>} Rejects if opening the page or navigating fails.
 */
const sendRequest = async (shortUrl) => {
  let firstReq = true;
  const targetUrl = `https://chat.openai.com/g/${shortUrl}`;
  const page = await browser.newPage();
  await page.setRequestInterception(true);

  // Named `interceptedRequest` so it does not shadow the module-level `request` library.
  page.on('request', async (interceptedRequest) => {
    if (firstReq) {
      firstReq = false;
      interceptedRequest.continue();
    } else {
      interceptedRequest.abort();
    }
  });

  page.on('response', async (response) => {
    if (String(response.status()).startsWith('3')) {
      // Nothing awaits this listener, so a failure in the redirect lookup or
      // in the nested crawl must be handled here; otherwise it surfaces as an
      // unhandled promise rejection.
      try {
        const redirectShortUrl = await getRedirectShortUrl(targetUrl);
        if (redirectShortUrl) {
          await sendRequest(redirectShortUrl);
        }
      } catch (error) {
        console.error('Error sending request:', error);
      }
      return;
    }

    let data;
    try {
      data = await response.text();
    } catch (err) {
      console.error(shortUrl, err);
      return;
    }

    try {
      const $ = cheerio.load(data);
      const dataContent = $('#__NEXT_DATA__').html();
      const obj = JSON.parse(dataContent);
      const gpts = _.get(obj, 'props.pageProps.gizmo');
      if (!gpts) {
        console.log('data not found:', shortUrl);
        // Stop here: an absent gizmo must not be reported as a hit nor stored.
        return;
      }
      console.log('got data:', shortUrl);
      gptsList.push(gpts);
    } catch (error) {
      console.error('crawler error: ', shortUrl, ' ', error);
    }
  });

  await page.goto(targetUrl);
};
/**
 * Launches the shared puppeteer browser (stored in the module-level `browser`)
 * with a `--proxy-server` flag built from PROXY_SERVER, then crawls the given
 * short URLs one at a time via sendRequest.
 *
 * A failing crawl is logged and does not stop the loop. A two-second pause
 * follows every URL, whether it succeeded or failed, including the last one.
 * The browser is closed after the loop.
 *
 * @param {Iterable<string>} shortUrls - Short URLs to crawl, in order.
 * @returns {Promise<void>}
 */
async function sendRequestsSerially(shortUrls) {
  const launchOptions = {
    args: [ `--proxy-server=${ PROXY_SERVER }` ],
  };
  browser = await puppeteer.launch(launchOptions);

  const pauseBetweenRequests = () => new Promise((wake) => {
    setTimeout(wake, 2 * 1000);
  });

  for (const currentShortUrl of shortUrls) {
    try {
      await sendRequest(currentShortUrl);
    } catch (failure) {
      console.error('Error sending request:', failure);
    } finally {
      await pauseBetweenRequests();
    }
  }

  await browser.close();
}
/**
 * Reads the list file at `filePath` (UTF-8) and returns its entries, one per
 * line.
 *
 * Both `\n` and `\r\n` line endings are accepted; each line is trimmed and
 * blank lines are dropped, so stray carriage returns or padding never end up
 * inside a crawled URL.
 *
 * @returns {Promise<string[]>} The non-empty, trimmed lines in file order.
 * @throws {Error} The underlying read error (after logging it), so the caller
 *   is not left awaiting a promise that never settles.
 */
async function getList() {
  let data;
  try {
    data = await fs.promises.readFile(filePath, 'utf8');
  } catch (err) {
    console.error('read file error: ', err);
    throw err;
  }

  return data
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}
/**
 * Runs the whole crawl: loads the short-URL list, crawls every entry serially,
 * then writes the collected `gptsList` as pretty-printed JSON (2-space indent)
 * to `<timestamp>.json`, where the timestamp comes from getOutputFileName.
 *
 * The write uses the callback form of `fs.writeFile` and is not awaited, so the
 * returned promise resolves once the write has been started. The outcome of
 * the write is reported on the console.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const shortUrls = await getList();
  await sendRequestsSerially(shortUrls);

  const outputFile = `${getOutputFileName()}.json`;
  const serialized = JSON.stringify(gptsList, null, 2);

  const reportWriteResult = (writeError) => {
    if (!writeError) {
      console.log('File written successfully!');
      return;
    }
    console.error(writeError);
  };

  fs.writeFile(outputFile, serialized, reportWriteResult);
}
// Script entry point: start the crawl. The promise returned by main() is not
// awaited or given a rejection handler here.
main()