-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
*: replace line by line text loaders by chunk by chunk text loaders
discojs/src/dataset: implement and test repeat and batchWithOverlap
- Loading branch information
Showing
15 changed files
with
313 additions
and
172 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,34 @@ | ||
import * as fs from "node:fs/promises"; | ||
import * as readline from "node:readline/promises"; | ||
|
||
import createDebug from "debug"; | ||
import { createReadStream } from 'node:fs'; | ||
import { Dataset, Text } from "@epfml/discojs"; | ||
|
||
export function load(path: string): Dataset<Text> { | ||
const debug = createDebug("discojs-node:loaders:text"); | ||
|
||
/** | ||
* Returns chunks of text. Use `minChunkSize` to ensure that | ||
* each chunk is bigger than the expected sequence length. | ||
* | ||
* @param path path to the text file to read | ||
* @param minChunkSize the minimum size of each chunk in bytes, defaults to 16KiB | ||
* @returns a dataset of text chunks read from the file | ||
*/ | ||
export function load(path: string, minChunkSize = 16384): Dataset<Text> { | ||
return new Dataset(async function* () { | ||
const input = (await fs.open(path)).createReadStream({ encoding: "utf8" }); | ||
if (minChunkSize < 1 || !Number.isInteger(minChunkSize)) | ||
throw new Error("minChunkSize must be positive integers"); | ||
|
||
debug("Setting the chunk size to %o bits", minChunkSize) | ||
// Create a stream to read the text file chunk by chunk | ||
const stream = createReadStream(path, { | ||
encoding: "utf8", | ||
highWaterMark: minChunkSize | ||
}); | ||
for await (const chunk of stream) { | ||
if (typeof chunk !== 'string') | ||
throw new Error('Expected file stream to yield string') | ||
|
||
// `readline` is a bit overkill but seems standard | ||
// https://nodejs.org/api/readline.html#example-read-file-stream-line-by-line | ||
yield* readline.createInterface({ input, crlfDelay: Infinity }); | ||
debug("yield chunk of length: %o", chunk.length); | ||
yield chunk | ||
} | ||
}); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.