forked from sullivan-sean/chat-langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingest.ts
84 lines (77 loc) · 2.44 KB
/
ingest.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import { HNSWLib } from "langchain/vectorstores";
import { OpenAIEmbeddings } from "langchain/embeddings";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import * as fs from "fs";
import { Document } from "langchain/document";
import { BaseDocumentLoader } from "langchain/document_loaders";
import path from "path";
import { load } from "cheerio";
/**
 * Reads a single file from disk and converts it into a Document.
 *
 * The file is decoded as UTF-8 and handed to cheerio's `load`; the text
 * content of the parsed result becomes `pageContent`, and the file path is
 * stored under `metadata.source`.
 *
 * @param filePath - Path of the file to read.
 * @returns A Document containing the extracted text of the file.
 * @throws Rejects with the underlying error when the file cannot be read,
 *   or when text extraction or Document construction throws.
 */
async function processFile(filePath: string): Promise<Document> {
  // Use the promise-based read so that *every* failure, including one raised
  // while parsing the contents, rejects the promise returned to the caller.
  // Parsing inside a callback-style `fs.readFile` handler would let such an
  // exception escape as an uncaught error and leave the promise unsettled.
  const fileContents = await fs.promises.readFile(filePath, "utf8");
  const text = load(fileContents).text();
  return new Document({ pageContent: text, metadata: { source: filePath } });
}
/**
 * Walks a directory tree and converts every file found into a Document.
 *
 * Entries are visited one at a time in the order returned by
 * `fs.readdirSync`; sub-directories are descended into recursively and their
 * documents are appended in place, so the result is a flat list.
 *
 * @param directoryPath - Directory to scan.
 * @returns All documents produced from the files below `directoryPath`.
 * @throws Error when the directory listing cannot be read (the original
 *   error is logged first); errors from `fs.statSync` or `processFile`
 *   propagate unchanged.
 */
async function processDirectory(directoryPath: string): Promise<Document[]> {
  let entries: string[];
  try {
    entries = fs.readdirSync(directoryPath);
  } catch (err) {
    // Log the low-level failure, then surface a hint about the likely cause.
    console.error(err);
    throw new Error(
      `Could not read directory: ${directoryPath}. Did you run \`sh download.sh\`?`
    );
  }

  const collected: Document[] = [];
  for (const entry of entries) {
    const entryPath = path.join(directoryPath, entry);
    if (fs.statSync(entryPath).isDirectory()) {
      // Recurse, then flatten the nested results into the running list.
      collected.push(...(await processDirectory(entryPath)));
    } else {
      collected.push(await processFile(entryPath));
    }
  }
  return collected;
}
/**
 * Document loader that produces one Document per file found beneath a
 * directory, delegating the traversal to `processDirectory`.
 */
class ReadTheDocsLoader extends BaseDocumentLoader {
  /** Root directory that `load` scans. */
  public filePath: string;

  /**
   * @param filePath - Root directory whose files should be loaded.
   */
  constructor(filePath: string) {
    super();
    this.filePath = filePath;
  }

  /**
   * Loads every file below `filePath`.
   *
   * @returns The documents returned by `processDirectory` for `filePath`.
   */
  async load(): Promise<Document[]> {
    const documents = await processDirectory(this.filePath);
    return documents;
  }
}
// Directory containing the documentation files to ingest. It is a relative
// path, so it is presumably resolved against the process working directory.
const directoryPath = "langchain.readthedocs.io";
// Module-level loader instance that `run` uses to read that directory.
const loader = new ReadTheDocsLoader(directoryPath);
/**
 * Runs the ingestion pipeline end to end:
 *   1. load the raw documents through the module-level `loader`,
 *   2. split them with a RecursiveCharacterTextSplitter
 *      (chunkSize 1000, chunkOverlap 200),
 *   3. build an HNSWLib vector store from the chunks using OpenAIEmbeddings,
 *   4. save the store under "data".
 *
 * Progress is reported on stdout after each stage.
 */
export const run = async () => {
  const rawDocs = await loader.load();
  console.log("Loader created.");

  // Split the text into chunks.
  const splitterOptions = { chunkSize: 1000, chunkOverlap: 200 };
  const splitter = new RecursiveCharacterTextSplitter(splitterOptions);
  const docs = await splitter.splitDocuments(rawDocs);
  console.log("Docs splitted.");

  console.log("Creating vector store...");
  // Create the vector store; the embeddings client is constructed only at
  // this point, after loading and splitting have finished.
  const embeddings = new OpenAIEmbeddings();
  const store = await HNSWLib.fromDocuments(docs, embeddings);
  await store.save("data");
};
// Script entry point: start the ingestion as soon as the module is loaded
// and print "done" once it has completed successfully.
(async () => {
  try {
    await run();
    console.log("done");
  } catch (err) {
    // Handle the failure explicitly rather than leaving an unhandled promise
    // rejection, which (depending on Node.js version and flags) may only emit
    // a warning and still exit with status 0.
    console.error(err);
    process.exitCode = 1;
  }
})();