Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent empty commits if files did not change #809

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion packages/hub/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
}
},
"browser": {
"./src/utils/sha256-node.ts": false,
"./src/utils/sha-node.ts": false,
"./src/utils/FileBlob.ts": false,
"./dist/index.js": "./dist/browser/index.js",
"./dist/index.mjs": "./dist/browser/index.mjs"
Expand Down
135 changes: 84 additions & 51 deletions packages/hub/src/lib/commit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { checkCredentials } from "../utils/checkCredentials";
import { chunk } from "../utils/chunk";
import { promisesQueue } from "../utils/promisesQueue";
import { promisesQueueStreaming } from "../utils/promisesQueueStreaming";
import { sha1 } from "../utils/sha1";
import { sha256 } from "../utils/sha256";
import { toRepoId } from "../utils/toRepoId";
import { WebBlob } from "../utils/WebBlob";
Expand Down Expand Up @@ -125,8 +126,6 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
const repoId = toRepoId(params.repo);
yield { event: "phase", phase: "preuploading" };

const lfsShas = new Map<string, string | null>();

const abortController = new AbortController();
const abortSignal = abortController.signal;

Expand Down Expand Up @@ -168,6 +167,9 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr

const gitAttributes = allOperations.filter(isFileOperation).find((op) => op.path === ".gitattributes")?.content;

const deleteOps = allOperations.filter((op): op is CommitDeletedEntry => !isFileOperation(op));
const fileOpsToCommit: (CommitBlob &
({ uploadMode: "lfs"; sha: string } | { uploadMode: "regular"; newSha: string | null }))[] = [];
for (const operations of chunk(allOperations.filter(isFileOperation), 100)) {
const payload: ApiPreuploadRequest = {
gitAttributes: gitAttributes && (await gitAttributes.text()),
Expand Down Expand Up @@ -203,43 +205,50 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr

const json: ApiPreuploadResponse = await res.json();

for (const file of json.files) {
if (file.uploadMode === "lfs") {
lfsShas.set(file.path, null);
yield* eventToGenerator<{ event: "fileProgress"; state: "hashing"; path: string; progress: number }, undefined>(
(yieldCallback) => {
return promisesQueue(
json.files.map((file) => async () => {
const op = operations.find((op) => op.path === file.path);
if (!op) {
return; // this should never happen, server-side we always return one entry per operation
}

const iterator =
file.uploadMode === "lfs"
? sha256(op.content, { useWebWorker: params.useWebWorkers, abortSignal })
: sha1(op.content, { abortSignal });
Pierrci marked this conversation as resolved.
Show resolved Hide resolved
let res: IteratorResult<number, string | null>;
do {
res = await iterator.next();
if (!res.done) {
yieldCallback({ event: "fileProgress", path: op.path, progress: res.value, state: "hashing" });
}
} while (!res.done);

if (res.value === null || res.value !== file.oid) {
fileOpsToCommit.push({
...op,
uploadMode: file.uploadMode,
sha: res.value,
} as (typeof fileOpsToCommit)[number]);
// ^^ we know `newSha` can only be `null` in the case of `sha1` as expected, but TS doesn't
}
}),
CONCURRENT_SHAS
);
}
}
);

abortSignal?.throwIfAborted();
}

yield { event: "phase", phase: "uploadingLargeFiles" };

for (const operations of chunk(
allOperations.filter(isFileOperation).filter((op) => lfsShas.has(op.path)),
for (const lfsOpsToCommit of chunk(
fileOpsToCommit.filter((op): op is CommitBlob & { uploadMode: "lfs"; sha: string } => op.uploadMode === "lfs"),
100
)) {
const shas = yield* eventToGenerator<
{ event: "fileProgress"; state: "hashing"; path: string; progress: number },
string[]
>((yieldCallback, returnCallback, rejectCallack) => {
return promisesQueue(
operations.map((op) => async () => {
const iterator = sha256(op.content, { useWebWorker: params.useWebWorkers, abortSignal: abortSignal });
let res: IteratorResult<number, string>;
do {
res = await iterator.next();
if (!res.done) {
yieldCallback({ event: "fileProgress", path: op.path, progress: res.value, state: "hashing" });
}
} while (!res.done);
const sha = res.value;
lfsShas.set(op.path, res.value);
return sha;
}),
CONCURRENT_SHAS
).then(returnCallback, rejectCallack);
});

abortSignal?.throwIfAborted();

const payload: ApiLfsBatchRequest = {
operation: "upload",
// multipart is a custom protocol for HF
Expand All @@ -250,8 +259,8 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
name: params.branch ?? "main",
},
}),
objects: operations.map((op, i) => ({
oid: shas[i],
objects: lfsOpsToCommit.map((op) => ({
oid: op.sha,
size: op.content.size,
})),
};
Expand Down Expand Up @@ -279,7 +288,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
const json: ApiLfsBatchResponse = await res.json();
const batchRequestId = res.headers.get("X-Request-Id") || undefined;

const shaToOperation = new Map(operations.map((op, i) => [shas[i], op]));
const shaToOperation = new Map(lfsOpsToCommit.map((op) => [op.sha, op]));

yield* eventToGenerator<CommitProgressEvent, void>((yieldCallback, returnCallback, rejectCallback) => {
return promisesQueueStreaming(
Expand All @@ -293,9 +302,9 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
abortSignal?.throwIfAborted();

if (obj.error) {
const errorMessage = `Error while doing LFS batch call for ${operations[shas.indexOf(obj.oid)].path}: ${
obj.error.message
}${batchRequestId ? ` - Request ID: ${batchRequestId}` : ""}`;
const errorMessage = `Error while doing LFS batch call for ${op.path}: ${obj.error.message}${
batchRequestId ? ` - Request ID: ${batchRequestId}` : ""
}`;
throw new HubApiError(res.url, obj.error.code, batchRequestId, errorMessage);
}
if (!obj.actions?.upload) {
Expand Down Expand Up @@ -367,9 +376,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
if (!res.ok) {
throw await createApiError(res, {
requestId: batchRequestId,
message: `Error while uploading part ${part} of ${
operations[shas.indexOf(obj.oid)].path
} to LFS storage`,
message: `Error while uploading part ${part} of ${op.path} to LFS storage`,
});
}

Expand Down Expand Up @@ -399,9 +406,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
if (!res.ok) {
throw await createApiError(res, {
requestId: batchRequestId,
message: `Error completing multipart upload of ${
operations[shas.indexOf(obj.oid)].path
} to LFS storage`,
message: `Error completing multipart upload of ${op.path} to LFS storage`,
});
}

Expand Down Expand Up @@ -438,7 +443,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
if (!res.ok) {
throw await createApiError(res, {
requestId: batchRequestId,
message: `Error while uploading ${operations[shas.indexOf(obj.oid)].path} to LFS storage`,
message: `Error while uploading ${op.path} to LFS storage`,
});
}

Expand All @@ -459,6 +464,35 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr

yield { event: "phase", phase: "committing" };

const allOpsToCommit = [...deleteOps, ...fileOpsToCommit];
if (allOpsToCommit.length === 0) {
const res: Response = await (params.fetch ?? fetch)(
`${params?.hubUrl || HUB_URL}/api/${repoId.type}s/${repoId.name}`,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there doesn't seem to be any method already to call this endpoint, correct?

Copy link
Member

@coyotte508 coyotte508 Jul 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes we should create one - in another PR - with expand handling too (getModelInfo, ...)

But here I think we should include the sha directly in the preupload response. Given we give already oid for individual files, and those are the oid in the context a specific commit from the remote, we should add the sha in the preupload response.

That way - no need for all this extra code here - cc @Wauplin as well

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should include the sha directly in the preupload response

That would simplifies things indeed!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see 325d917

{
headers: {
accept: "application/json",
...(params.credentials && { Authorization: `Bearer ${params.credentials.accessToken}` }),
},
}
);

if (!res.ok) {
throw await createApiError(res);
}

const { sha } = await res.json();

return {
commit: {
oid: sha,
url: `${params?.hubUrl || HUB_URL}${repoId.type !== "model" ? `/${repoId.type}s` : ""}/${
repoId.name
}/commit/${sha}`,
},
hookOutput: "Nothing to commit",
};
coyotte508 marked this conversation as resolved.
Show resolved Hide resolved
}

return yield* eventToGenerator<CommitProgressEvent, CommitOutput>(
async (yieldCallback, returnCallback, rejectCallback) =>
(params.fetch ?? fetch)(
Expand All @@ -481,17 +515,16 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
} satisfies ApiCommitHeader,
},
...((await Promise.all(
allOperations.map((operation) => {
allOpsToCommit.map((operation) => {
if (isFileOperation(operation)) {
const sha = lfsShas.get(operation.path);
if (sha) {
if (operation.uploadMode === "lfs") {
return {
key: "lfsFile",
value: {
path: operation.path,
algo: "sha256",
size: operation.content.size,
oid: sha,
oid: operation.sha,
} satisfies ApiCommitLfsFile,
};
}
Expand All @@ -509,8 +542,8 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
progressCallback: (progress: number) => {
// For now, we display equal progress for all files
// We could compute the progress based on the size of `convertOperationToNdJson` for each of the files instead
for (const op of allOperations) {
if (isFileOperation(op) && !lfsShas.has(op.path)) {
for (const op of allOpsToCommit) {
if (isFileOperation(op) && op.uploadMode === "regular") {
yieldCallback({
event: "fileProgress",
path: op.path,
Expand Down
5 changes: 5 additions & 0 deletions packages/hub/src/types/api/api-commit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ export interface ApiPreuploadResponse {
files: Array<{
path: string;
uploadMode: "lfs" | "regular";
/**
* The oid of the blob if it already exists in the repository
* in case of blob is a lfs file, it'll be the lfs oid
Pierrci marked this conversation as resolved.
Show resolved Hide resolved
*/
oid?: string;
}>;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,38 @@ export async function* sha256Node(
abortSignal?: AbortSignal;
}
): AsyncGenerator<number, string> {
const sha256Stream = createHash("sha256");
return yield* shaNode("sha256", buffer, opts);
}

export async function* sha1Node(
buffer: ArrayBuffer | Blob,
opts?: {
abortSignal?: AbortSignal;
}
): AsyncGenerator<number, string> {
return yield* shaNode("sha1", buffer, opts);
}
Pierrci marked this conversation as resolved.
Show resolved Hide resolved

async function* shaNode(
algorithm: "sha1" | "sha256",
buffer: ArrayBuffer | Blob,
opts?: {
abortSignal?: AbortSignal;
}
): AsyncGenerator<number, string> {
const shaStream = createHash(algorithm);
const size = buffer instanceof Blob ? buffer.size : buffer.byteLength;
let done = 0;
const readable =
buffer instanceof Blob ? Readable.fromWeb(buffer.stream() as ReadableStream) : Readable.from(Buffer.from(buffer));

for await (const buffer of readable) {
sha256Stream.update(buffer);
shaStream.update(buffer);
done += buffer.length;
yield done / size;

opts?.abortSignal?.throwIfAborted();
}

return sha256Stream.digest("hex");
return shaStream.digest("hex");
}
31 changes: 31 additions & 0 deletions packages/hub/src/utils/sha1.ts
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested nothing is broken by renaming this file - https://huggingface.co/spaces/coyotte508/hub-sha256 - and it seems fine

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { hexFromBytes } from "./hexFromBytes";
import { isFrontend } from "./isFrontend";

export async function* sha1(buffer: Blob, opts?: { abortSignal?: AbortSignal }): AsyncGenerator<number, string | null> {
yield 0;

if (globalThis.crypto?.subtle) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no threshold like for sha256 since non-LFS files are always < 10MB (which is the threshold for sha256 IIUC)

const res = hexFromBytes(
new Uint8Array(
await globalThis.crypto.subtle.digest("SHA-1", buffer instanceof Blob ? await buffer.arrayBuffer() : buffer)
)
);

yield 1;
return res;
}

if (isFrontend) {
yield 1;
return null; // unsupported if we're here
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to keep things simple, and it's not really a problem IMO, it just means that, in this case, we'll always upload non-LFS files

}

if (!cryptoModule) {
cryptoModule = await import("./sha-node");
}

return yield* cryptoModule.sha1Node(buffer, { abortSignal: opts?.abortSignal });
}

// eslint-disable-next-line @typescript-eslint/consistent-type-imports
let cryptoModule: typeof import("./sha-node");
4 changes: 2 additions & 2 deletions packages/hub/src/utils/sha256.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ export async function* sha256(
}

if (!cryptoModule) {
cryptoModule = await import("./sha256-node");
cryptoModule = await import("./sha-node");
}

return yield* cryptoModule.sha256Node(buffer, { abortSignal: opts?.abortSignal });
}

// eslint-disable-next-line @typescript-eslint/consistent-type-imports
let cryptoModule: typeof import("./sha256-node");
let cryptoModule: typeof import("./sha-node");
// eslint-disable-next-line @typescript-eslint/consistent-type-imports
let wasmModule: typeof import("../vendor/hash-wasm/sha256-wrapper");
Loading