Skip to content

Commit

Permalink
✨ Add PdfLoader (#74)
Browse files Browse the repository at this point in the history
  • Loading branch information
lowczarc authored Sep 26, 2023
1 parent 8c25aae commit 36b593a
Show file tree
Hide file tree
Showing 11 changed files with 312 additions and 181 deletions.
10 changes: 10 additions & 0 deletions examples/dataloader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import { readFileSync } from "fs";
import { generate, PdfLoader } from "../lib";

// Example: feed a PDF to `generate` through `PdfLoader` and print the resulting infos.
// Usage: ts-node examples/dataloader.ts [path/to/file.pdf]
(async () => {
    // Let the caller point at their own PDF; fall back to the original sample path
    // so running the example without arguments behaves as before.
    const pdfPath = process.argv[2] ?? "/home/lancelot/test.pdf";

    const result = await generate("What is the secret ?", {
        data: PdfLoader(readFileSync(pdfPath)),
    }).infos();

    console.log(result);
})().catch((error: unknown) => {
    // Report failures (unreadable file, request error) explicitly instead of
    // relying on the runtime's unhandled-rejection behavior.
    console.error(error);
    process.exitCode = 1;
});
Binary file removed examples/dataloader/AudioLoader.mp3
Binary file not shown.
Binary file removed examples/dataloader/PdfLoader.pdf
Binary file not shown.
34 changes: 0 additions & 34 deletions examples/dataloader/StringLoader.ts

This file was deleted.

41 changes: 0 additions & 41 deletions examples/dataloader/TextFileLoader.csv

This file was deleted.

69 changes: 0 additions & 69 deletions examples/dataloader/dataloader.ts

This file was deleted.

67 changes: 45 additions & 22 deletions lib/dataloader/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { getDocument, GlobalWorkerOptions, version } from "pdfjs-dist";
import { Memory } from "../memory";
import { transcribe } from "../transcribe";
import { splitString } from "../split";
Expand All @@ -20,24 +21,8 @@ async function batchify<T extends Array<unknown>>(

export type LoaderFunction = (memory: Memory, clientOptions: InputClientOptions) => Promise<void>;

/**
 * Builds a loader that reads a text file, splits its content and adds the pieces to a memory.
 *
 * @param file - Input file; turned into a buffer by `fileInputToBuffer` and decoded as UTF-8.
 * @param maxTokenPerChunk - Forwarded to `splitString` as the chunk limit (defaults to 100).
 * @returns A loader that adds every piece produced by `splitString` to the memory it is given.
 */
export function TextFileLoader(file: FileInput, maxTokenPerChunk = 100): LoaderFunction {
    // Named after what it loads (text), matching the other loaders' `load<Kind>IntoMemory` names.
    return async function loadTextIntoMemory(
        memory: Memory,
        _clientOptions: InputClientOptions = {},
    ) {
        const fileBuffer = await fileInputToBuffer(file);
        const chunks = splitString(fileBuffer.toString("utf8"), maxTokenPerChunk);

        // All entries of one batch are added concurrently.
        async function addBatchIntoMemory(batch: string[]): Promise<void> {
            await Promise.all(batch.map(async (chunk) => memory.add(chunk)));
        }

        // `batchify` is called with a batch size argument of 10.
        await batchify(chunks, 10, addBatchIntoMemory);
    };
}

export function StringLoader(str: string, maxTokenPerChunk = 100): LoaderFunction {
return async function loadPdfIntoMemory(
return async function loadStringIntoMemory(
memory: Memory,
_clientOptions: InputClientOptions = {},
) {
Expand All @@ -51,20 +36,58 @@ export function StringLoader(str: string, maxTokenPerChunk = 100): LoaderFunctio
};
}

/**
 * Builds a loader for a text file by decoding it as UTF-8 and delegating to `StringLoader`.
 *
 * @param file - Input file; turned into a buffer by `fileInputToBuffer`.
 * @param maxTokenPerChunk - Passed through to `StringLoader` (defaults to 100).
 * @returns A loader that forwards its own arguments unchanged to the string loader.
 */
export function TextFileLoader(file: FileInput, maxTokenPerChunk = 100): LoaderFunction {
    return async function loadTextIntoMemory(...args) {
        const text = (await fileInputToBuffer(file)).toString("utf8");
        const loadString = StringLoader(text, maxTokenPerChunk);

        return loadString(...args);
    };
}

/**
 * Builds a loader that transcribes an audio file and loads the transcription via `StringLoader`.
 *
 * @param file - Input audio file; turned into a buffer by `fileInputToBuffer`.
 * @param maxTokenPerChunk - Passed through to `StringLoader` (defaults to 100).
 * @returns A loader that transcribes the file with the given client options, then hands the
 *          transcription, the memory and the client options to the string loader.
 */
export function AudioLoader(file: FileInput, maxTokenPerChunk = 100): LoaderFunction {
    return async function loadAudioIntoMemory(
        memory: Memory,
        clientOptions: InputClientOptions = {},
    ) {
        const fileBuffer = await fileInputToBuffer(file);
        const transcription = await transcribe(fileBuffer, clientOptions);

        // The transcription is handed over whole together with `maxTokenPerChunk`;
        // the previous local `splitString` result was never read, so it is not computed anymore.
        return StringLoader(transcription, maxTokenPerChunk)(memory, clientOptions);
    };
}

// Adds every entry of `batches` to `memory` concurrently and waits for all adds to finish;
// a failing add rejects the returned promise.
// NOTE(review): `memory` is not declared inside this function — presumably it is meant to be
// captured from an enclosing loader's scope; confirm before relying on this helper.
async function addBatchIntoMemory(batches: string[]) {
await Promise.all(batches.map(async (batch) => memory.add(batch)));
}
/**
 * Extracts the text of every page of a PDF document.
 *
 * @param pdf - Raw bytes of the PDF file.
 * @returns The non-empty text items of all pages, in page order, joined with newlines.
 */
export async function pdfToString(pdf: Uint8Array): Promise<string> {
    // pdf.js must be told where its worker script lives: a package path when there is no
    // `window`, otherwise a CDN URL matching the installed pdf.js version.
    if (typeof window === "undefined") {
        GlobalWorkerOptions.workerSrc = `pdfjs-dist/build/pdf.worker.js`;
    } else {
        GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${version}/pdf.worker.js`;
    }

    const pdfDocument = await getDocument(pdf).promise;

    try {
        // Page numbers are 1-based.
        const pages = await Promise.all(
            Array.from({ length: pdfDocument.numPages }, (_, index) =>
                pdfDocument.getPage(index + 1),
            ),
        );

        const textEntries = await Promise.all(
            pages.map(async (page) => {
                const pageObject = await page.getTextContent();

                return pageObject.items
                    .map((e) => ("str" in e ? e.str : "")) // items without `str` carry no text
                    .filter((e) => e !== "")
                    .join("\n");
            }),
        );

        return textEntries.join("\n");
    } finally {
        // Release the document (and its worker) whether extraction succeeded or failed.
        await pdfDocument.destroy();
    }
}

/**
 * Builds a loader for a PDF file by extracting its text and delegating to `StringLoader`.
 *
 * @param file - Input PDF; turned into a buffer by `fileInputToBuffer`.
 * @param maxTokenPerChunk - Passed through to `StringLoader` (defaults to 100).
 * @returns A loader that forwards its own arguments unchanged to the string loader.
 */
export function PdfLoader(file: FileInput, maxTokenPerChunk = 100): LoaderFunction {
    return async function loadPdfIntoMemory(...args) {
        // `pdfToString` takes a Uint8Array, so the buffer's bytes are wrapped in one first.
        const pdfBytes = new Uint8Array(await fileInputToBuffer(file));
        const extractedText = await pdfToString(pdfBytes);
        const loadString = StringLoader(extractedText, maxTokenPerChunk);

        return loadString(...args);
    };
}

Expand Down
3 changes: 2 additions & 1 deletion lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { usage } from "./user";
import { get as KVGet, set as KVSet } from "./kv";
import PolyfactClientBuilder from "./client";
import { generateImage } from "./image";
import { TextFileLoader, StringLoader, AudioLoader } from "./dataloader";
import { TextFileLoader, StringLoader, AudioLoader, PdfLoader } from "./dataloader";

import {
getAllPrompts,
Expand Down Expand Up @@ -73,6 +73,7 @@ export {
TextFileLoader,
StringLoader,
AudioLoader,
PdfLoader,
};

export default PolyfactClientBuilder;
6 changes: 4 additions & 2 deletions lib/prompt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,17 @@ export type Prompt = {
like?: number;
use?: number;
tags?: string[];
user_id?: string; // eslint-disable-line camelcase
public: boolean;
};

export type PromptInsert = Pick<Prompt, "name" | "description" | "prompt" | "tags">;
/**
 * Subset of `Prompt` limited to `name`, `description`, `prompt`, `tags` and `public`.
 * NOTE(review): presumably the payload used when creating a prompt — confirm against callers.
 */
export type PromptInsert = Pick<Prompt, "name" | "description" | "prompt" | "tags" | "public">;
/** Same fields as `PromptInsert`, with every field optional. */
export type PromptUpdate = Partial<PromptInsert>;

async function axiosWrapper<T>(
method: "get" | "post" | "put" | "delete",
url: string,
data?: Record<string, string | string[]> | undefined,
data?: Record<string, string | string[] | boolean> | undefined,
clientOptions: InputClientOptions = {},
): Promise<T> {
const { token, endpoint } = await defaultOptions(clientOptions);
Expand Down
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"isomorphic-ws": "^5.0.0",
"js-tiktoken": "^1.0.7",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.10.111",
"polyfact-io-ts": "^2.2.20",
"process": "^0.11.10",
"react": "^18.2.0",
Expand Down Expand Up @@ -67,8 +68,8 @@
"start": "ts-node cmd/index.ts",
"lint": "prettier --check lib/ cmd/ ; eslint lib/ cmd/",
"lint:fix": "prettier --write lib/ cmd/ ; eslint --fix lib/ cmd/",
"build:cmd": "esbuild cmd/index.ts --bundle --outfile=build/polyfact.tmp --platform=node && echo '#!/usr/bin/env node\n' | cat - build/polyfact.tmp > build/polyfact && rm build/polyfact.tmp",
"build:vanilla-js": "esbuild target/vanilla-js.ts --bundle --minify --target=chrome67,firefox68,edge79,safari15 --outfile=build/vanilla-js.js",
"build:cmd": "esbuild cmd/index.ts --bundle --external:canvas --outfile=build/polyfact.tmp --platform=node && echo '#!/usr/bin/env node\n' | cat - build/polyfact.tmp > build/polyfact && rm build/polyfact.tmp",
"build:vanilla-js": "esbuild target/vanilla-js.ts --bundle --external:canvas --minify --target=chrome67,firefox68,edge79,safari15 --outfile=build/vanilla-js.js",
"build": "tsc --target es2021 --lib es2021,DOM --moduleResolution node --strict --esModuleInterop --declaration --jsx react --skipLibCheck --outDir dist --rootDir lib lib/*.ts lib/**/*.ts && npm run build:cmd && cp build/polyfact package.json README.md dist/",
"npm-publish": "npm run build && cd dist && npm publish && cd .."
}
Expand Down
Loading

0 comments on commit 36b593a

Please sign in to comment.