Skip to content

Commit

Permalink
Merge pull request #620 from opengisch/QF-4901-etag
Browse files Browse the repository at this point in the history
Use the etag to check whether the file should be synced or not
  • Loading branch information
suricactus authored Nov 1, 2024
2 parents e5e245d + 23cc5c0 commit 8bdecd8
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 4 deletions.
8 changes: 7 additions & 1 deletion qfieldsync/core/cloud_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,13 @@ def get_user_organizations(self, username: str) -> QNetworkReply:
def get_files(self, project_id: str, client: str = "qgis") -> QNetworkReply:
"""Get project files and their versions"""

return self.cloud_get(["files", project_id], {"client": client})
return self.cloud_get(
["files", project_id],
{
"client": client,
"skip_metadata": 1,
},
)

def get_file(self, url: QUrl, local_filename: str) -> QNetworkReply:
"""Download file from external URL"""
Expand Down
66 changes: 63 additions & 3 deletions qfieldsync/core/cloud_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@


import hashlib
import os
import sqlite3
from enum import IntFlag
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Union

from libqfieldsync.utils.qgis import get_qgis_files_within_dir
from qgis.core import QgsProject
Expand All @@ -40,8 +41,50 @@ class ProjectFileCheckout(IntFlag):
LocalAndCloud = 3


def calc_etag(filename: Union[str, Path], part_size: int = 8 * 1024 * 1024) -> str:
"""Calculate ETag as in Object Storage (S3) of a local file.
ETag is a MD5. But for the multipart uploaded files, the MD5 is computed from the concatenation of the MD5s of each uploaded part.
See the inspiration of this implementation here: https://stackoverflow.com/a/58239738/1226137
Args:
filename (str): the local filename
part_size (int): the size of the Object Storage part. Most Object Storages use 8MB. Defaults to 8*1024*1024.
Returns:
str: the calculated ETag value
"""
with open(filename, "rb") as f:
file_size = os.fstat(f.fileno()).st_size

if file_size <= part_size:
BLOCKSIZE = 65536
hasher = hashlib.md5()

buf = f.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
buf = f.read(BLOCKSIZE)

return hasher.hexdigest()
else:
# Say you uploaded a 14MB file and your part size is 5MB.
# Calculate 3 MD5 checksums corresponding to each part, i.e. the checksum of the first 5MB, the second 5MB, and the last 4MB.
# Then take the checksum of their concatenation.
# Since MD5 checksums are hex representations of binary data, just make sure you take the MD5 of the decoded binary concatenation, not of the ASCII or UTF-8 encoded concatenation.
# When that's done, add a hyphen and the number of parts to get the ETag.
md5sums = []
for data in iter(lambda: f.read(part_size), b""):
md5sums.append(hashlib.md5(data).digest())

final_md5sum = hashlib.md5(b"".join(md5sums))

return "{}-{}".format(final_md5sum.hexdigest(), len(md5sums))


class ProjectFile:
def __init__(self, data: Dict[str, Any], local_dir: str = None) -> None:
def __init__(self, data: Dict[str, Any], local_dir: Optional[str] = None) -> None:
self._local_dir = local_dir
self._temp_dir = None
self._data = data
Expand Down Expand Up @@ -97,11 +140,18 @@ def size(self) -> Optional[int]:
def sha256(self) -> Optional[str]:
return self._data.get("sha256")

@property
def etag(self) -> Optional[str]:
# NOTE the etag is actually stored in the `md5sum` key by mistake
return self._data.get("md5sum")

@property
def local_size(self) -> Optional[int]:
if not self.local_path_exists:
return

assert self.local_path

return self.local_path.stat().st_size

@property
Expand Down Expand Up @@ -129,6 +179,16 @@ def local_sha256(self) -> Optional[str]:
with open(self.local_path, "rb") as f:
return hashlib.sha256(f.read()).hexdigest()

@property
def local_etag(self) -> Optional[str]:
if not self.local_path_exists:
return

assert self.local_path
assert self.local_path.is_file()

return calc_etag(self.local_path)

def flush(self) -> None:
if not self._local_dir:
return
Expand Down Expand Up @@ -286,7 +346,7 @@ def files_to_sync(self) -> Iterator[ProjectFile]:
project_file.flush()

# don't attempt to sync files that are the same both locally and remote
if project_file.sha256 == project_file.local_sha256:
if project_file.etag == project_file.local_etag:
continue

# ignore local files that are not in the temp directory
Expand Down

0 comments on commit 8bdecd8

Please sign in to comment.